feat: godot-engine-source-4.3-stable

This commit is contained in:
Jan van der Weide 2025-01-17 16:36:38 +01:00
parent c59a7dcade
commit 7125d019b5
11149 changed files with 5070401 additions and 0 deletions

View file

@ -0,0 +1,74 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: appendable.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec07
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/appendable.h"
#include "unicode/utf16.h"
U_NAMESPACE_BEGIN
// Destructor. The body is empty: nothing is released here.
Appendable::~Appendable() {}
/**
 * Appends one code point in UTF-16 form via appendCodeUnit().
 * A value up to 0xffff is forwarded as a single code unit; anything larger is
 * forwarded as a lead/trail surrogate pair. The value of c is not range-checked here.
 *
 * @param c the code point to append
 * @return the result of appendCodeUnit(); for a pair, the trail unit is only
 *         attempted when the lead unit was accepted, and both must be accepted
 */
UBool
Appendable::appendCodePoint(UChar32 c) {
    if(c>0xffff) {
        // Supplementary: lead first, trail only if the lead was accepted.
        return appendCodeUnit(U16_LEAD(c)) && appendCodeUnit(U16_TRAIL(c));
    }
    return appendCodeUnit((char16_t)c);
}
/**
 * Appends a UTF-16 string one code unit at a time via appendCodeUnit().
 *
 * @param s      the code units to append; read up to a NUL terminator when length<0
 * @param length number of code units, or a negative value for NUL-terminated input
 * @return false as soon as appendCodeUnit() rejects a unit, otherwise true
 *         (including for length==0, where nothing is read)
 */
UBool
Appendable::appendString(const char16_t *s, int32_t length) {
    if(length<0) {
        // NUL-terminated: forward every unit before the terminator.
        for(char16_t unit=*s++; unit!=0; unit=*s++) {
            if(!appendCodeUnit(unit)) {
                return false;
            }
        }
        return true;
    }
    if(length>0) {
        const char16_t *const end=s+length;
        while(s<end) {
            if(!appendCodeUnit(*s++)) {
                return false;
            }
        }
    }
    return true;
}
// Implementation that ignores the requested capacity and always returns true.
UBool
Appendable::reserveAppendCapacity(int32_t /*appendCapacity*/) {
return true;
}
/**
 * Hands back the caller's scratch buffer when it is large enough.
 * The capacity hint is ignored by this implementation.
 *
 * @param minCapacity     required capacity; must be at least 1
 * @param scratch         caller-provided buffer
 * @param scratchCapacity capacity of scratch
 * @param resultCapacity  output: scratchCapacity on success, 0 on failure
 * @return scratch when minCapacity>=1 and scratchCapacity>=minCapacity, otherwise nullptr
 */
char16_t *
Appendable::getAppendBuffer(int32_t minCapacity,
                            int32_t /*desiredCapacityHint*/,
                            char16_t *scratch, int32_t scratchCapacity,
                            int32_t *resultCapacity) {
    const bool scratchUsable = minCapacity>=1 && scratchCapacity>=minCapacity;
    *resultCapacity = scratchUsable ? scratchCapacity : 0;
    return scratchUsable ? scratch : nullptr;
}
// UnicodeStringAppendable is implemented in unistr.cpp.
U_NAMESPACE_END

View file

@ -0,0 +1,741 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2007-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: bmpset.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan29
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "bmpset.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
/*
 * Builds the lookup tables for the given inversion list.
 * The list pointer and length are stored, not copied.
 */
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
        list(parentList), listLength(parentListLength) {
    // All tables start out zeroed; initBits() sets what the list contains.
    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
    uprv_memset(table7FF, 0, sizeof(table7FF));
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));

    /*
     * Record the list indexes that bound the binary searches for
     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     * U+0800 is the first 3-byte-UTF-8 code point; lower code points are
     * looked up in the bit tables.
     * The final pair of indexes brackets the supplementary code points.
     */
    const int32_t lastIndex=listLength-1;
    list4kStarts[0]=findCodePoint(0x800, 0, lastIndex);
    for(int32_t block=1; block<=0x10; ++block) {
        // Each search starts where the previous one ended.
        list4kStarts[block]=findCodePoint(block<<12, list4kStarts[block-1], lastIndex);
    }
    list4kStarts[0x11]=lastIndex;

    // U+FFFD lies in the 4k block starting at U+F000.
    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);

    initBits();
    overrideIllegal();
}
/*
 * Creates a BMPSet that reuses the already-computed tables of otherBMPSet
 * (containsFFFD, latin1Contains, table7FF, bmpBlockBits, list4kStarts are copied)
 * but refers to a different inversion list pointer and length.
 * The tables are not recomputed from newParentList.
 * NOTE(review): presumably newParentList holds the same contents as the other
 * set's list — confirm against callers.
 */
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
containsFFFD(otherBMPSet.containsFFFD),
list(newParentList), listLength(newParentListLength) {
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
}
// Destructor. The body is empty; in particular the list pointer is not freed here.
BMPSet::~BMPSet() {
}
/*
* Set bits in a bit rectangle in "vertical" bit organization.
* start<limit<=0x800
*
* The value v in start..limit-1 is represented by bit (v>>6) of table[v&0x3f],
* so consecutive values occupy the same bit position in consecutive table words.
* Bits are only ever ORed in; nothing is cleared.
*
* @param table 64 words of 32 bits each
* @param start first value to set (inclusive)
* @param limit end of the range (exclusive)
*/
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
U_ASSERT(start<limit);
U_ASSERT(limit<=0x800);
int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
// Set one bit indicating an all-one block.
uint32_t bits=(uint32_t)1<<lead;
if((start+1)==limit) { // Single-character shortcut.
table[trail]|=bits;
return;
}
int32_t limitLead=limit>>6;
int32_t limitTrail=limit&0x3f;
if(lead==limitLead) {
// Partial vertical bit column.
while(trail<limitTrail) {
table[trail++]|=bits;
}
} else {
// Partial vertical bit column,
// followed by a bit rectangle,
// followed by another partial vertical bit column.
if(trail>0) {
do {
table[trail++]|=bits;
} while(trail<64);
++lead;
}
if(lead<limitLead) {
// All bit positions from lead upward ...
bits=~(((unsigned)1<<lead)-1);
if(limitLead<0x20) {
// ... and below limitLead. For limitLead==0x20 no upper masking is needed.
bits&=((unsigned)1<<limitLead)-1;
}
for(trail=0; trail<64; ++trail) {
table[trail]|=bits;
}
}
// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
// In that case the shift count is clamped to 31 so that the shift stays
// in range for a 32-bit value; the resulting bits value is not used
// because trail<limitTrail is already false.
bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
for(trail=0; trail<limitTrail; ++trail) {
table[trail]|=bits;
}
}
}
/*
* Fills latin1Contains[], table7FF[] and bmpBlockBits[] from the inversion list.
* The list is read as pairs (start inclusive, limit exclusive); when the list
* ends after a start value, 0x110000 is used as the limit.
* The loops below share start/limit/listIndex: each stage continues with the
* range at which the previous stage stopped.
*/
void BMPSet::initBits() {
UChar32 start, limit;
int32_t listIndex=0;
// Set latin1Contains[].
do {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(start>=0x100) {
break;
}
// Mark each code point of this range that lies below U+0100.
do {
latin1Contains[start++]=1;
} while(start<limit && start<0x100);
} while(limit<=0x100);
// Find the first range overlapping with (or after) 80..FF again,
// to include them in table7FF as well.
for(listIndex=0;;) {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(limit>0x80) {
// Clamp so that nothing below U+0080 is written into table7FF here.
if(start<0x80) {
start=0x80;
}
break;
}
}
// Set table7FF[].
while(start<0x800) {
set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
if(limit>0x800) {
// This range continues past U+07FF; carry it into the next stage.
start=0x800;
break;
}
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
}
// Set bmpBlockBits[].
// minStart is the first code point not yet covered by a block already marked as mixed.
int32_t minStart=0x800;
while(start<0x10000) {
if(limit>0x10000) {
limit=0x10000;
}
if(start<minStart) {
start=minStart;
}
if(start<limit) { // Else: Another range entirely in a known mixed-value block.
if(start&0x3f) {
// Mixed-value block of 64 code points.
// 0x10001 sets both bit positions (lead and lead+16) for this block.
start>>=6;
bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
start=(start+1)<<6; // Round up to the next block boundary.
minStart=start; // Ignore further ranges in this block.
}
if(start<limit) {
if(start<(limit&~0x3f)) {
// Multiple all-ones blocks of 64 code points each.
// The arguments are block numbers (code point >> 6), not code points.
set32x64Bits(bmpBlockBits, start>>6, limit>>6);
}
if(limit&0x3f) {
// Mixed-value block of 64 code points.
limit>>=6;
bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
limit=(limit+1)<<6; // Round up to the next block boundary.
minStart=limit; // Ignore further ranges in this block.
}
}
}
if(limit==0x10000) {
break;
}
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
}
}
/*
* Override some bits and bytes to the result of contains(FFFD)
* for faster validity checking at runtime.
* No need to set 0 values where they were reset to 0 in the constructor
* and not modified by initBits().
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* Need to set 0 values for surrogates D800..DFFF.
*/
void BMPSet::overrideIllegal() {
uint32_t bits, mask;
int32_t i;
if(containsFFFD) {
bits=3; // Lead bytes 0xC0 and 0xC1.
for(i=0; i<64; ++i) {
table7FF[i]|=bits;
}
bits=1; // Lead byte 0xE0.
for(i=0; i<32; ++i) { // First half of 4k block.
bmpBlockBits[i]|=bits;
}
mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
bits=1<<0xd;
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
}
} else {
mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]&=mask;
}
}
}
/*
 * Returns the smallest index i in lo..hi such that c < list[i].
 * Assumes list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 *
 * Examples:
 *                                     findCodePoint(c)
 *   set              list[]         c=0 1 3 4 7 8
 *   ===              ==============   ===========
 *   []               [110000]         0 0 0 0 0 0
 *   [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 *   [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 *   [:Any:]          [0, 110000]      1 1 1 1 1 1
 */
int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    if (c < list[lo]) {
        return lo;
    }
    // c is often beyond the last range, so test for that before searching.
    if (lo >= hi || c >= list[hi-1]) {
        return hi;
    }
    // Invariants while searching: list[lo] <= c < list[hi].
    for (int32_t mid = (lo + hi) >> 1; mid != lo; mid = (lo + hi) >> 1) {
        if (c < list[mid]) {
            hi = mid;
        } else {
            lo = mid;
        }
    }
    // lo and hi are adjacent now; hi is the answer.
    return hi;
}
/*
 * Tests whether c is in the set, choosing the lookup by code point range:
 * byte table up to U+00FF, bit table up to U+07FF, per-64-block bits for the
 * rest of the BMP outside the surrogates, and a restricted binary search for
 * surrogates and supplementary code points.
 */
UBool
BMPSet::contains(UChar32 c) const {
    if((uint32_t)c<=0xff) {
        return (UBool)latin1Contains[c];
    }
    if((uint32_t)c<=0x7ff) {
        const uint32_t leadBit=(uint32_t)1<<(c>>6);
        return (UBool)((table7FF[c&0x3f]&leadBit)!=0);
    }
    if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
        const int lead=c>>12;
        const uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
        if(twoBits>1) {
            // Mixed block: search within the code point's 4k block.
            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
        }
        // Uniform block: all 64 code points with the same bits 15..6
        // are either in the set or not.
        return (UBool)twoBits;
    }
    if((uint32_t)c<=0x10ffff) {
        // surrogate or supplementary code point
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    }
    // Out-of-range code points get false, consistent with long-standing
    // behavior of UnicodeSet::contains(c).
    return false;
}
/*
* Check for sufficient length for trail unit for each surrogate pair.
* Handle single surrogates as surrogate code points as usual in ICU.
*
* Scans forward from s while each character's membership matches spanCondition
* (only tested here as zero vs. non-zero).
* *s is read before s is compared with limit, so the caller must pass s<limit.
* Returns the position of the first character that does not match, or the
* position reached after the last character if all of them match.
* The "span" and "span not" loops are identical except for the inverted tests.
*/
const char16_t *
BMPSet::span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
char16_t c, c2;
if(spanCondition) {
// span
do {
c=*s;
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits==0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
// c is a surrogate here. It is treated as unpaired if it is a trail surrogate,
// if it is the last unit, or if the next unit (stored in c2) is not a trail surrogate.
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
// surrogate code point
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Skip the trail unit; the loop increment skips the lead unit.
++s;
}
} while(++s<limit);
} else {
// span not
do {
c=*s;
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits!=0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
} else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
// surrogate code point
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Skip the trail unit; the loop increment skips the lead unit.
++s;
}
} while(++s<limit);
}
return s;
}
/*
* Symmetrical with span().
*
* Scans backward from limit toward s while each character's membership matches
* spanCondition (only tested here as zero vs. non-zero).
* limit is decremented before the first comparison with s, so the caller must pass s<limit.
* Returns s if every character matches; otherwise returns the position just
* after the last code unit of the first non-matching character encountered.
*/
const char16_t *
BMPSet::spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
char16_t c, c2;
if(spanCondition) {
// span
for(;;) {
c=*(--limit);
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits==0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
// c is a surrogate here. It is treated as unpaired if it is a lead surrogate,
// if it is the first unit, or if the preceding unit (stored in c2) is not a lead surrogate.
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
// surrogate code point
if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Step over the lead unit as well.
--limit;
}
if(s==limit) {
return s;
}
}
} else {
// span not
for(;;) {
c=*(--limit);
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
break;
}
} else if(c<0xd800 || c>=0xe000) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits!=0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
break;
}
}
} else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
// surrogate code point
if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
break;
}
} else {
// surrogate pair
if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Step over the lead unit as well.
--limit;
}
if(s==limit) {
return s;
}
}
}
// limit points at the code unit that ended the span; the span starts right after it.
return limit+1;
}
/*
* Precheck for sufficient trail bytes at end of string only once per span.
* Check validity.
*
* Scans forward over UTF-8 bytes while each character's membership matches spanCondition.
* *s and *(limit-1) are read unconditionally, so the caller must pass length>0.
* Ill-formed bytes are evaluated one byte at a time with the value of containsFFFD.
* Returns the position of the first non-matching character or ill-formed byte,
* or the end of the matching text (limit0) when the scan runs to the end.
*/
const uint8_t *
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
if(U8_IS_SINGLE(b)) {
// Initial all-ASCII span.
if(spanCondition) {
do {
if(!latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while(U8_IS_SINGLE(b));
} else {
do {
if(latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while(U8_IS_SINGLE(b));
}
// Remaining length after the ASCII prefix; used by the end-of-string checks below.
length=(int32_t)(limit-s);
}
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
}
const uint8_t *limit0=limit;
/*
* Make sure that the last 1/2/3/4-byte sequence before limit is complete
* or runs into a lead byte.
* In the span loop compare s with limit only once
* per multi-byte character.
*
* Give a trailing illegal sequence the same value as the result of contains(FFFD),
* including it if that is part of the span, otherwise set limit0 to before
* the truncated sequence.
*/
b=*(limit-1);
if((int8_t)b<0) {
// b>=0x80: lead or trail byte
if(b<0xc0) {
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
if(containsFFFD!=spanCondition) {
limit0=limit;
}
// Here b holds *(limit-2) if length>=2 (assigned in the condition above),
// otherwise it still holds the last byte.
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
}
uint8_t t1, t2, t3;
while(s<limit) {
b=*s;
if(U8_IS_SINGLE(b)) {
// ASCII
if(spanCondition) {
do {
if(!latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(U8_IS_SINGLE(b));
} else {
do {
if(latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(U8_IS_SINGLE(b));
}
}
++s; // Advance past the lead byte.
if(b>=0xe0) {
if(b<0xf0) {
if( /* handle U+0000..U+FFFF inline */
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f
) {
b&=0xf;
uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
if(twoBits<=1) {
// All 64 code points with this lead byte and middle trail byte
// are either in the set or not.
if(twoBits!=(uint32_t)spanCondition) {
// s-1 is the lead byte of the non-matching character.
return s-1;
}
} else {
// Look up the code point in its 4k block of code points.
UChar32 c=(b<<12)|(t1<<6)|t2;
if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
return s-1;
}
}
s+=2;
continue;
}
} else if( /* handle U+10000..U+10FFFF inline */
(t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
(t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
(t3=(uint8_t)(s[2]-0x80)) <= 0x3f
) {
// Give an illegal sequence the same value as the result of contains(FFFD).
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
containsFFFD
) != spanCondition
) {
return s-1;
}
s+=3;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
b>=0xc0 &&
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
return s-1;
}
++s;
continue;
}
}
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
if(containsFFFD!=spanCondition) {
return s-1;
}
}
return limit0;
}
/*
* While going backwards through UTF-8 optimize only for ASCII.
* Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
* possible to tell from the last byte in a multi-byte sequence how many
* preceding bytes there should be. Therefore, going backwards through UTF-8
* is much harder than going forward.
*
* Scans backward from s+length while each character's membership matches spanCondition.
* s[length-1] is read unconditionally, so the caller must pass length>0.
* Returns 0 if everything matches; otherwise returns the index just after the
* last byte of the first non-matching character encountered.
*/
int32_t
BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
}
uint8_t b;
do {
b=s[--length];
if(U8_IS_SINGLE(b)) {
// ASCII sub-span
if(spanCondition) {
do {
if(!latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while(U8_IS_SINGLE(b));
} else {
do {
if(latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while(U8_IS_SINGLE(b));
}
}
// prev is the index of the last byte of the character about to be decoded.
int32_t prev=length;
UChar32 c;
// trail byte: collect a multi-byte character
// (or lead byte in last-trail position)
// NOTE(review): presumably the final argument (-3) makes the helper return U+FFFD
// for ill-formed input — confirm against utf8_prevCharSafeBody().
c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
// c is a valid code point, not ASCII, not a surrogate
if(c<=0x7ff) {
if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
return prev+1;
}
} else if(c<=0xffff) {
int lead=c>>12;
uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
if(twoBits<=1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if(twoBits!=(uint32_t)spanCondition) {
return prev+1;
}
} else {
// Look up the code point in its 4k block of code points.
if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
return prev+1;
}
}
} else {
if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
return prev+1;
}
}
} while(length>0);
return 0;
}
U_NAMESPACE_END

164
engine/thirdparty/icu4c/common/bmpset.h vendored Normal file
View file

@ -0,0 +1,164 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: bmpset.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2007jan29
* created by: Markus W. Scherer
*/
#ifndef __BMPSET_H__
#define __BMPSET_H__
#include "unicode/utypes.h"
#include "unicode/uniset.h"
U_NAMESPACE_BEGIN
/*
* Helper class for frozen UnicodeSets, implements contains() and span()
* optimized for BMP code points. Structured to be UTF-8-friendly.
*
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
* with mixed for illegal ranges.
* Supplementary characters: Binary search over
* the supplementary part of the parent set's inversion list.
*
* The object stores the parent's list pointer without copying or freeing it.
*/
class BMPSet : public UMemory {
public:
/*
* Builds all lookup tables from the given inversion list.
*/
BMPSet(const int32_t *parentList, int32_t parentListLength);
/*
* Copies the lookup tables from otherBMPSet and stores the new list pointer and length;
* the tables are not recomputed from newParentList.
*/
BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);
virtual ~BMPSet();
/*
* Returns whether c is in the set; code points above U+10FFFF and negative values yield false.
*/
virtual UBool contains(UChar32 c) const;
/*
* Span the initial substring for which each character c has spanCondition==contains(c).
* It must be s<limit and spanCondition==0 or 1.
* @return The string pointer which limits the span.
*/
const char16_t *span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;
/*
* Span the trailing substring for which each character c has spanCondition==contains(c).
* It must be s<limit and spanCondition==0 or 1.
* @return The string pointer which starts the span.
*/
const char16_t *spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;
/*
* Span the initial substring for which each character c has spanCondition==contains(c).
* It must be length>0 and spanCondition==0 or 1.
* @return The string pointer which limits the span.
*/
const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
/*
* Span the trailing substring for which each character c has spanCondition==contains(c).
* It must be length>0 and spanCondition==0 or 1.
* @return The start of the span.
*/
int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
private:
/* Fills latin1Contains[], table7FF[] and bmpBlockBits[] from the inversion list. */
void initBits();
/* Sets the table entries for illegal UTF-8 forms according to containsFFFD. */
void overrideIllegal();
/**
* Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
* binary search is restricted for finding code points in a certain range.
*
* For restricting the search for finding in the range start..end,
* pass in
* lo=findCodePoint(start) and
* hi=findCodePoint(end)
* with 0<=lo<=hi<len.
* findCodePoint(c) defaults to lo=0 and hi=len-1.
*
* @param c a character in a subrange of MIN_VALUE..MAX_VALUE
* @param lo The lowest index to be returned.
* @param hi The highest index to be returned.
* @return the smallest integer i in the range lo..hi,
* inclusive, such that c < list[i]
*/
int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
/*
* Membership test via findCodePoint(c, lo, hi): c is in the set
* when the returned list index is odd.
*/
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
/*
* One byte 0 or 1 per Latin-1 character.
*/
UBool latin1Contains[0x100];
/* true if contains(U+FFFD). */
UBool containsFFFD;
/*
* One bit per code point from U+0000..U+07FF.
* The bits are organized vertically; consecutive code points
* correspond to the same bit positions in consecutive table words.
* With code point parts
* lead=c{10..6}
* trail=c{5..0}
* it is set.contains(c)==(table7FF[trail] bit lead)
*
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
* for faster validity checking at runtime.
*/
uint32_t table7FF[64];
/*
* One bit per 64 BMP code points.
* The bits are organized vertically; consecutive 64-code point blocks
* correspond to the same bit position in consecutive table words.
* With code point parts
* lead=c{15..12}
* t1=c{11..6}
* test bits (lead+16) and lead in bmpBlockBits[t1].
* If the upper bit is 0, then the lower bit indicates if contains(c)
* for all code points in the 64-block.
* If the upper bit is 1, then the block is mixed and set.contains(c)
* must be called.
*
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
* the result of contains(FFFD) for faster validity checking at runtime.
*/
uint32_t bmpBlockBits[64];
/*
* Inversion list indexes for restricted binary searches in
* findCodePoint(), from
* findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
* U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
* always looked up in the bit tables.
* The last pair of indexes is for finding supplementary code points.
* (18 entries: index 0 for U+0800, indexes 1..0x10 for U+1000..U+10000,
* index 0x11 for the last list index.)
*/
int32_t list4kStarts[18];
/*
* The inversion list of the parent set, for the slower contains() implementation
* for mixed BMP blocks and for supplementary code points.
* The list is terminated with list[listLength-1]=0x110000.
*/
const int32_t *list;
int32_t listLength;
};
/*
 * Membership test through the inversion list: c is in the set exactly when
 * the index returned by findCodePoint(c, lo, hi) is odd.
 */
inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {
    const int32_t listIndex = findCodePoint(c, lo, hi);
    return (UBool)(listIndex & 1);
}
U_NAMESPACE_END
#endif

View file

@ -0,0 +1,367 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
************************************************************************************
* Copyright (C) 2006-2016, International Business Machines Corporation
* and others. All Rights Reserved.
************************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ures.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ustring.h"
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/rbbi.h"
#include "brkeng.h"
#include "cmemory.h"
#include "dictbe.h"
#include "lstmbe.h"
#include "charstr.h"
#include "dictionarydata.h"
#include "mutex.h"
#include "uvector.h"
#include "umutex.h"
#include "uresimp.h"
#include "ubrkimpl.h"
U_NAMESPACE_BEGIN
/*
******************************************************************
*/
// Constructor with an empty body.
LanguageBreakEngine::LanguageBreakEngine() {
}
// Destructor with an empty body.
LanguageBreakEngine::~LanguageBreakEngine() {
}
/*
******************************************************************
*/
// Constructor with an empty body.
LanguageBreakFactory::LanguageBreakFactory() {
}
// Destructor with an empty body.
LanguageBreakFactory::~LanguageBreakFactory() {
}
/*
******************************************************************
*/
// Starts with no set of handled characters (fHandled is nullptr until
// handleCharacter() allocates it). The status argument is neither read nor written.
UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
(void)status;
}
// Deletes the set of handled characters, if one was allocated.
UnhandledEngine::~UnhandledEngine() {
delete fHandled;
fHandled = nullptr;
}
/**
 * Reports whether c is in the set of characters recorded by handleCharacter().
 * Returns false while no set has been allocated yet. The locale is ignored.
 */
UBool
UnhandledEngine::handles(UChar32 c, const char* locale) const {
    (void)locale; // Unused
    const bool known = (fHandled != nullptr) && fHandled->contains(c);
    return known;
}
/**
 * Advances the text position from startPos over the run of characters that are
 * in the handled set, stopping at endPos. No breaks are reported.
 *
 * The handled set is checked for nullptr, matching handles(): if
 * handleCharacter() has never allocated it, no character counts as handled
 * and the position stays at startPos.
 *
 * @param text     text to scan; its native index is modified
 * @param startPos native index at which scanning starts
 * @param endPos   native index at which scanning stops
 * @param status   nothing is done if this is already a failure
 * @return always 0
 */
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t startPos,
                             int32_t endPos,
                             UVector32 &/*foundBreaks*/,
                             UBool /* isPhraseBreaking */,
                             UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    utext_setNativeIndex(text, startPos);
    UChar32 c = utext_current32(text);
    while((int32_t)utext_getNativeIndex(text) < endPos &&
            fHandled != nullptr && fHandled->contains(c)) {
        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
/**
 * Records c as handled by adding the whole script of c to the handled set.
 * The set is allocated on first use; if that allocation fails the call does nothing.
 * The status of applyIntPropertyValue() is not reported to the caller.
 */
void
UnhandledEngine::handleCharacter(UChar32 c) {
    if (fHandled == nullptr) {
        fHandled = new UnicodeSet();
    }
    if (fHandled == nullptr || fHandled->contains(c)) {
        // Allocation failed, or the character is already covered.
        return;
    }
    UErrorCode status = U_ZERO_ERROR;
    // Apply the entire script of the character.
    const int32_t scriptOfC = u_getIntPropertyValue(c, UCHAR_SCRIPT);
    fHandled->applyIntPropertyValue(UCHAR_SCRIPT, scriptOfC, status);
}
/*
******************************************************************
*/
// Starts without an engine stack; ensureEngines() allocates it on demand.
// The status argument is unused.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
fEngines = nullptr;
}
// Deletes the engine stack, if one was allocated.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
delete fEngines;
}
/**
 * Allocates the engine stack on first use, under a function-local mutex.
 * On failure, status is set and fEngines remains nullptr.
 */
void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
    static UMutex gBreakEngineMutex;
    Mutex lock(&gBreakEngineMutex);
    if (fEngines != nullptr) {
        return;
    }
    // The stack is created with uprv_deleteUObject as its element deleter.
    LocalPointer<UStack> newStack(new UStack(uprv_deleteUObject, nullptr, status), status);
    if (U_FAILURE(status)) {
        return;  // newStack releases the partially created stack, if any.
    }
    fEngines = newStack.orphan();
}
/**
 * Returns an engine that handles c for the given locale.
 * Engines already on the stack are tried from the most recently pushed one
 * downward; if none handles c, loadEngineFor() is asked for a new engine,
 * which is then pushed onto the stack.
 *
 * @return the engine, or nullptr if the stack could not be created, no engine
 *         is available for c, or pushing the new engine failed
 */
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
    UErrorCode status = U_ZERO_ERROR;
    ensureEngines(status);
    if (U_FAILURE(status)) {
        // Note: no way to return error code to caller.
        return nullptr;
    }

    // NOTE(review): as a function-local static this is a separate object from
    // the same-named mutexes in ensureEngines() and addExternalEngine() —
    // confirm that this is intended.
    static UMutex gBreakEngineMutex;
    Mutex lock(&gBreakEngineMutex);

    for (int32_t i = fEngines->size() - 1; i >= 0; --i) {
        const LanguageBreakEngine *candidate =
            (const LanguageBreakEngine *)(fEngines->elementAt(i));
        if (candidate != nullptr && candidate->handles(c, locale)) {
            return candidate;
        }
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c, locale);
    if (created != nullptr) {
        fEngines->push((void *)created, status);
    }
    return U_SUCCESS(status) ? created : nullptr;
}
/*
* Creates a break engine for the script of c. The locale argument is unused.
* An LSTM-based engine is tried first; if that does not produce an engine,
* a dictionary-based engine is created for the scripts listed in the switch.
* Returns nullptr if the script lookup fails, no dictionary matcher is
* available, the script has no engine, or engine construction fails.
*/
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
const LanguageBreakEngine *engine = nullptr;
// Try to use LSTM first
const LSTMData *data = CreateLSTMDataForScript(code, status);
if (U_SUCCESS(status)) {
if (data != nullptr) {
engine = CreateLSTMBreakEngine(code, data, status);
if (U_SUCCESS(status) && engine != nullptr) {
return engine;
}
// LSTM engine creation did not succeed: delete the engine if one was
// returned, otherwise delete the data directly.
// NOTE(review): presumably a returned engine owns data — confirm.
if (engine != nullptr) {
delete engine;
engine = nullptr;
} else {
DeleteLSTMData(data);
}
}
}
status = U_ZERO_ERROR; // fallback to dictionary based
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
if (m != nullptr) {
switch(code) {
case USCRIPT_THAI:
engine = new ThaiBreakEngine(m, status);
break;
case USCRIPT_LAO:
engine = new LaoBreakEngine(m, status);
break;
case USCRIPT_MYANMAR:
engine = new BurmeseBreakEngine(m, status);
break;
case USCRIPT_KHMER:
engine = new KhmerBreakEngine(m, status);
break;
#if !UCONFIG_NO_NORMALIZATION
// CJK not available w/o normalization
case USCRIPT_HANGUL:
engine = new CjkBreakEngine(m, kKorean, status);
break;
// use same BreakEngine and dictionary for both Chinese and Japanese
case USCRIPT_HIRAGANA:
case USCRIPT_KATAKANA:
case USCRIPT_HAN:
engine = new CjkBreakEngine(m, kChineseJapanese, status);
break;
#if 0
// TODO: Have to get some characters with script=common handled
// by CjkBreakEngine (e.g. U+309B). Simply subjecting
// them to CjkBreakEngine does not work. The engine has to
// special-case them.
case USCRIPT_COMMON:
{
UBlockCode block = ublock_getCode(code);
if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
engine = new CjkBreakEngine(dict, kChineseJapanese, status);
break;
}
#endif
#endif
default:
break;
}
// No engine was created (unsupported script or failed allocation):
// the matcher is still owned here and must be deleted.
if (engine == nullptr) {
delete m;
}
// An engine exists but its construction reported an error: discard it.
else if (U_FAILURE(status)) {
delete engine;
engine = nullptr;
}
return engine;
}
}
return nullptr;
}
/*
* Loads the dictionary for the given script and wraps it in a DictionaryMatcher.
* The dictionary file name is looked up under "dictionaries" in the root
* break-iterator resource bundle, keyed by the script's short name.
* Returns nullptr if the lookup or the data file open fails, if the trie type
* is neither BYTES nor UCHARS, or if allocating the matcher fails.
*/
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
UErrorCode status = U_ZERO_ERROR;
// open root from brkitr tree.
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
// b is passed as the fill-in argument and reassigned to the result.
b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
int32_t dictnlength = 0;
const char16_t *dictfname =
ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
if (U_FAILURE(status)) {
ures_close(b);
return nullptr;
}
CharString dictnbuf;
CharString ext;
// Split the file name at the last dot into base name (dictnbuf) and extension (ext).
const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
if (extStart != nullptr) {
int32_t len = (int32_t)(extStart - dictfname);
ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status);
dictnlength = len;
}
dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status);
ures_close(b);
UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
if (U_SUCCESS(status)) {
// build trie
// The data starts with an array of int32_t indexes; the trie is at the given byte offset.
const uint8_t *data = (const uint8_t *)udata_getMemory(file);
const int32_t *indexes = (const int32_t *)data;
const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
DictionaryMatcher *m = nullptr;
if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
const char *characters = (const char *)(data + offset);
m = new BytesDictionaryMatcher(characters, transform, file);
}
else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
const char16_t *characters = (const char16_t *)(data + offset);
m = new UCharsDictionaryMatcher(characters, file);
}
if (m == nullptr) {
// no matcher exists to take ownership - either we are an invalid
// type or memory allocation failed
udata_close(file);
}
return m;
} else if (dictfname != nullptr) {
// we don't have a dictionary matcher.
// returning nullptr here will cause us to fail to find a dictionary break engine, as expected
// (status is a local variable and is not read again after this assignment.)
status = U_ZERO_ERROR;
return nullptr;
}
return nullptr;
}
/**
 * Adopts an external break engine, wraps it in a BreakEngineWrapper and pushes
 * the wrapper onto this factory's engine stack.
 *
 * @param external Engine to adopt; the caller gives up ownership in all cases.
 * @param status   In/out error code. On failure nothing is added and the
 *                 adopted engine is released.
 */
void ICULanguageBreakFactory::addExternalEngine(
        ExternalBreakEngine* external, UErrorCode& status) {
    // Take ownership first so that every exit path below releases the engine.
    LocalPointer<ExternalBreakEngine> engine(external, status);
    ensureEngines(status);
    if (U_FAILURE(status)) {
        // Either the caller passed a failure/null engine or the engine stack
        // could not be set up; fEngines may still be null here, so it must not
        // be dereferenced. 'engine' deletes the adopted object on return.
        return;
    }
    LocalPointer<BreakEngineWrapper> wrapper(
        new BreakEngineWrapper(engine.orphan(), status), status);
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);
    // The stack receives the raw pointer; the LocalPointer then lets go of it
    // so the wrapper is not released a second time here.
    fEngines->push(wrapper.getAlias(), status);
    wrapper.orphan();
}
/**
 * Wraps an external engine; the pointer is handed to the 'delegate'
 * LocalPointer together with the error code.
 */
BreakEngineWrapper::BreakEngineWrapper(
        ExternalBreakEngine* engine, UErrorCode &status)
    : delegate(engine, status) {}

/** Nothing to do explicitly: 'delegate' cleans up after itself. */
BreakEngineWrapper::~BreakEngineWrapper() = default;

/** Forwards the "is this engine for (c, locale)?" question to the delegate. */
UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
    const UBool accepted = delegate->isFor(c, locale);
    return accepted;
}
/**
 * Finds breaks in the leading run of [startPos, endPos) that the external
 * engine handles and appends them to foundBreaks.
 *
 * @param text        Text to analyze; on success its index is left at the end
 *                    of the handled run.
 * @param startPos    Start of the range to examine.
 * @param endPos      End (exclusive) of the range to examine.
 * @param foundBreaks Receives the breaks, appended after its existing content.
 * @param status      In/out error code.
 * @return The number of breaks appended, or 0 on failure (foundBreaks is then
 *         left with its original size).
 */
int32_t BreakEngineWrapper::findBreaks(
    UText *text,
    int32_t startPos,
    int32_t endPos,
    UVector32 &foundBreaks,
    UBool /* isPhraseBreaking */,
    UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    int32_t result = 0;

    // Find the span of characters handled by the delegate. The span begins at
    // startPos and extends forward until endPos or the first character the
    // delegate does not handle.
    utext_setNativeIndex(text, startPos);
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) {
        utext_next32(text); // TODO: recast loop for postincrement
        c = utext_current32(text);
    }
    rangeStart = start;
    rangeEnd = current;

    int32_t beforeSize = foundBreaks.size();
    int32_t additionalCapacity = rangeEnd - rangeStart + 1;
    // Enlarge the vector so it contains (rangeEnd-rangeStart+1) more items that
    // the delegate can fill in place.
    foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
    if (U_FAILURE(status)) return 0;
    // Expose exactly the reserved scratch area (previously beforeSize was
    // added twice, growing the vector far beyond what was reserved).
    foundBreaks.setSize(beforeSize + additionalCapacity);
    result = delegate->fillBreaks(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
                                  additionalCapacity, status);
    if (U_FAILURE(status)) {
        // Drop the scratch entries so the caller does not see bogus breaks.
        foundBreaks.setSize(beforeSize);
        return 0;
    }
    // Trim to the number of breaks actually written.
    foundBreaks.setSize(beforeSize + result);
    utext_setNativeIndex(text, current);
    return result;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

324
engine/thirdparty/icu4c/common/brkeng.h vendored Normal file
View file

@ -0,0 +1,324 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
************************************************************************************
* Copyright (C) 2006-2012, International Business Machines Corporation and others. *
* All Rights Reserved. *
************************************************************************************
*/
#ifndef BRKENG_H
#define BRKENG_H
#include "unicode/umisc.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
#include "unicode/uscript.h"
U_NAMESPACE_BEGIN
class UnicodeSet;
class UStack;
class UVector32;
class DictionaryMatcher;
class ExternalBreakEngine;
/*******************************************************************
* LanguageBreakEngine
*/
/**
* <p>LanguageBreakEngines implement language-specific knowledge for
* finding text boundaries within a run of characters belonging to a
* specific set. The boundaries will be of a specific kind, e.g. word,
* line, etc.</p>
*
* <p>LanguageBreakEngines should normally be implemented so as to
* be shared between threads without locking.</p>
*/
class LanguageBreakEngine : public UObject {
public:
/**
* <p>Default constructor.</p>
*
*/
LanguageBreakEngine();
/**
* <p>Virtual destructor.</p>
*/
virtual ~LanguageBreakEngine();
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c, const char* locale) const = 0;
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A UText representing the text. The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param foundBreaks A Vector of int32_t to receive the breaks.
* @param isPhraseBreaking Presumably whether phrase-based breaking was
* requested (TODO confirm against the implementing engines); not every
* engine uses it.
* @param status Information on any errors encountered.
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode &status) const = 0;
};
/*******************************************************************
* BreakEngineWrapper
*/
/**
* <p>BreakEngineWrapper implements LanguageBreakEngine as
* a thin wrapper that delegates the work to an ExternalBreakEngine.
* </p>
*/
class BreakEngineWrapper : public LanguageBreakEngine {
public:
/**
* Wraps the given external engine.
* @param engine The external engine to delegate to; it is stored in an
* owning LocalPointer.
* @param status The error code.
*/
BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
/** Virtual destructor. */
virtual ~BreakEngineWrapper();
/** Implements LanguageBreakEngine::handles() for the wrapped engine. */
virtual UBool handles(UChar32 c, const char* locale) const override;
/** Implements LanguageBreakEngine::findBreaks() for the wrapped engine. */
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode &status) const override;
private:
/** The wrapped external engine. */
LocalPointer<ExternalBreakEngine> delegate;
};
/*******************************************************************
* LanguageBreakFactory
*/
/**
* <p>LanguageBreakFactorys find and return a LanguageBreakEngine
* that can determine breaks for characters in a specific set, if
* such an object can be found.</p>
*
* <p>If a LanguageBreakFactory is to be shared between threads,
* appropriate synchronization must be used; there is none internal
* to the factory.</p>
*
* <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
* normally be shared between threads without synchronization, unless
* the specific subclass of LanguageBreakFactory indicates otherwise.</p>
*
* <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
* it returns when it itself is deleted, unless the specific subclass of
* LanguageBreakFactory indicates otherwise. Naturally, the factory should
* not be deleted until the LanguageBreakEngines it has returned are no
* longer needed.</p>
*/
class LanguageBreakFactory : public UMemory {
public:
/**
* <p>Default constructor.</p>
*
*/
LanguageBreakFactory();
/**
* <p>Virtual destructor.</p>
*/
virtual ~LanguageBreakFactory();
/**
* <p>Find and return a LanguageBreakEngine that can find the desired
* kind of break for the set of characters to which the supplied
* character belongs. It is up to the set of available engines to
* determine what the sets of characters are.</p>
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or
* nullptr if no such engine is available.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
};
/*******************************************************************
* UnhandledEngine
*/
/**
* <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
* handles characters that no other LanguageBreakEngine is available to
* handle. It is told the character and the type of break; at its
* discretion it may handle more than the specified character (e.g.,
* the entire script to which that character belongs).</p>
*
* <p>UnhandledEngines may not be shared between threads without
* external synchronization.</p>
*/
class UnhandledEngine : public LanguageBreakEngine {
private:
/**
* The sets of characters handled.
* @internal
*/
UnicodeSet *fHandled;
public:
/**
* <p>Constructor.</p>
*
* @param status Information on any errors encountered.
*/
UnhandledEngine(UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~UnhandledEngine();
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A UText representing the text. The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param foundBreaks A Vector of int32_t to receive the breaks, if any.
* @param isPhraseBreaking Presumably whether phrase-based breaking was
* requested (TODO confirm against the implementation).
* @param status Information on any errors encountered.
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode &status) const override;
/**
* <p>Tell the engine to handle a particular character and break type.</p>
*
* @param c A character which the engine should handle
*/
virtual void handleCharacter(UChar32 c);
};
/*******************************************************************
* ICULanguageBreakFactory
*/
/**
* <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
* ICU. It creates dictionary-based LanguageBreakEngines from dictionary
* data in the ICU data file.</p>
*/
class ICULanguageBreakFactory : public LanguageBreakFactory {
private:
/**
* The stack of break engines created by this factory
* @internal
*/
UStack *fEngines;
public:
/**
* <p>Standard constructor.</p>
*
* @param status Information on any errors encountered.
*/
ICULanguageBreakFactory(UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~ICULanguageBreakFactory();
/**
* <p>Find and return a LanguageBreakEngine that can find the desired
* kind of break for the set of characters to which the supplied
* character belongs. It is up to the set of available engines to
* determine what the sets of characters are.</p>
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or
* nullptr if none is available.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
/**
* Add and adopt the engine. Nothing is returned.
* @param engine The ExternalBreakEngine to be added and adopted. The caller
* passes the ownership and should not release the memory after this.
* @param status the error code.
*/
virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
protected:
/**
* <p>Create a LanguageBreakEngine for the set of characters to which
* the supplied character belongs.</p>
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or
* nullptr if none could be created.
*/
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
/**
* <p>Create a DictionaryMatcher for the specified script.</p>
* @param script An ISO 15924 script code that identifies the dictionary to be
* created.
* @return A DictionaryMatcher with the desired characteristics, or nullptr.
*/
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
private:
/**
* Helper called before fEngines is used; presumably creates the engine
* stack on first use (TODO confirm against the implementation).
* @param status the error code.
*/
void ensureEngines(UErrorCode& status);
};
U_NAMESPACE_END
/* BRKENG_H */
#endif

View file

@ -0,0 +1,547 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1997-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*
* File brkiter.cpp
*
* Modification History:
*
* Date Name Description
* 02/18/97 aliu Converted from OpenClass. Added DONE.
* 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
*****************************************************************************************
*/
// *****************************************************************************
// This file was generated from the java source file BreakIterator.java
// *****************************************************************************
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/rbbi.h"
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/filteredbrk.h"
#include "bytesinkutil.h"
#include "ucln_cmn.h"
#include "cstring.h"
#include "umutex.h"
#include "servloc.h"
#include "locbased.h"
#include "uresimp.h"
#include "uassert.h"
#include "ubrkimpl.h"
#include "utracimp.h"
#include "charstr.h"
// *****************************************************************************
// class BreakIterator
// This class implements methods for finding the location of boundaries in text.
// Instances of BreakIterator maintain a current position and scan over text
// returning the index of characters where boundaries occur.
// *****************************************************************************
U_NAMESPACE_BEGIN
// -------------------------------------
/**
 * Builds a RuleBasedBreakIterator for a locale from the brkitr resource tree.
 *
 * The rules file name is read from "boundaries"/<type> in the locale's bundle,
 * split at the first '.' into base name and extension, and opened with
 * udata_open(); the resulting data is handed to a new RuleBasedBreakIterator.
 *
 * @param loc    Requested locale.
 * @param type   Key inside the "boundaries" table (for example "word"); a type
 *               containing "phrase" turns on the iterator's phrase flag.
 * @param status In/out error code.
 * @return The new iterator, or nullptr on failure.
 */
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
{
    char fnbuff[256];
    char ext[4]={'\0'};
    CharString actualLocale;
    // Initialized because it is inspected below even when the resource lookups
    // failed and never wrote a length into it.
    int32_t size = 0;
    const char16_t* brkfname = nullptr;
    UResourceBundle brkRulesStack;
    UResourceBundle brkNameStack;
    UResourceBundle *brkRules = &brkRulesStack;
    UResourceBundle *brkName = &brkNameStack;
    RuleBasedBreakIterator *result = nullptr;

    if (U_FAILURE(status))
        return nullptr;

    ures_initStackObject(brkRules);
    ures_initStackObject(brkName);

    // Get the locale
    UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);

    // Get the "boundaries" array.
    if (U_SUCCESS(status)) {
        brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
        // Get the string object naming the rules file
        brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
        // Get the actual string
        brkfname = ures_getString(brkName, &size, &status);
        U_ASSERT((size_t)size<sizeof(fnbuff));
        if ((size_t)size>=sizeof(fnbuff)) {
            // The name would not fit into fnbuff: report overflow, copy nothing.
            size=0;
            if (U_SUCCESS(status)) {
                status = U_BUFFER_OVERFLOW_ERROR;
            }
        }

        // Use the string if we found it
        if (U_SUCCESS(status) && brkfname) {
            actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);

            char16_t* extStart=u_strchr(brkfname, 0x002e);
            int len = 0;
            if (extStart != nullptr){
                len = (int)(extStart-brkfname);
                // NOTE(review): copies sizeof(ext) units starting after the dot;
                // this appears to assume an extension of at most 3 characters.
                u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
                u_UCharsToChars(brkfname, fnbuff, len);
            }
            fnbuff[len]=0; // nul terminate
        }
    }

    ures_close(brkRules);
    ures_close(brkName);

    UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
    if (U_FAILURE(status)) {
        ures_close(b);
        return nullptr;
    }

    // Create a RuleBasedBreakIterator
    result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);

    // If there is a result, set the valid locale and actual locale, and the kind
    if (U_SUCCESS(status) && result != nullptr) {
        U_LOCALE_BASED(locBased, *(BreakIterator*)result);
        locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
                              actualLocale.data());
        uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
        result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
    }

    ures_close(b);

    if (U_FAILURE(status) && result != nullptr) { // Sometimes redundant check, but simple
        delete result;
        return nullptr;
    }

    if (result == nullptr) {
        // Allocation failed, so nothing took over the data file: close it here.
        udata_close(file);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }

    return result;
}
// The five factories below differ only in the UBRK_* kind they forward to
// createInstance().

// Word-break iterator for the given locale.
BreakIterator* U_EXPORT2
BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
{
    return BreakIterator::createInstance(key, UBRK_WORD, status);
}

// -------------------------------------

// Line-break iterator for the given locale.
BreakIterator* U_EXPORT2
BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
{
    return BreakIterator::createInstance(key, UBRK_LINE, status);
}

// -------------------------------------

// Character-break iterator for the given locale.
BreakIterator* U_EXPORT2
BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
{
    return BreakIterator::createInstance(key, UBRK_CHARACTER, status);
}

// -------------------------------------

// Sentence-break iterator for the given locale.
BreakIterator* U_EXPORT2
BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
{
    return BreakIterator::createInstance(key, UBRK_SENTENCE, status);
}

// -------------------------------------

// Title-casing break iterator for the given locale.
BreakIterator* U_EXPORT2
BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
{
    return BreakIterator::createInstance(key, UBRK_TITLE, status);
}
// -------------------------------------
// Returns the list of available locales by forwarding to
// Locale::getAvailableLocales(); 'count' is filled in by that call.
const Locale* U_EXPORT2
BreakIterator::getAvailableLocales(int32_t& count)
{
    const Locale* available = Locale::getAvailableLocales(count);
    return available;
}
// ------------------------------------------
//
// Constructors, destructor and assignment operator
//
//-------------------------------------------
// Default constructor: all three locale IDs start out as empty strings.
BreakIterator::BreakIterator()
{
    requestLocale[0] = 0;
    actualLocale[0] = 0;
    validLocale[0] = 0;
}

// Copy constructor: copies the three locale ID buffers.
BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
}

// Assignment: copies the three locale ID buffers; self-assignment is a no-op.
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    return *this;
}

// Destructor: nothing to release.
BreakIterator::~BreakIterator() = default;
// ------------------------------------------
//
// Registration
//
//-------------------------------------------
#if !UCONFIG_NO_SERVICE
// -------------------------------------
// Service factory whose handleCreate() builds break iterators through
// BreakIterator::makeInstance().
class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
public:
    virtual ~ICUBreakIteratorFactory();
protected:
    virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override {
        BreakIterator* created = BreakIterator::makeInstance(loc, kind, status);
        return created;
    }
};

ICUBreakIteratorFactory::~ICUBreakIteratorFactory() = default;
// -------------------------------------
class ICUBreakIteratorService : public ICULocaleService {
public:
ICUBreakIteratorService()
: ICULocaleService(UNICODE_STRING("Break Iterator", 14))
{
UErrorCode status = U_ZERO_ERROR;
registerFactory(new ICUBreakIteratorFactory(), status);
}
virtual ~ICUBreakIteratorService();
virtual UObject* cloneInstance(UObject* instance) const override {
return ((BreakIterator*)instance)->clone();
}
virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override {
LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key));
int32_t kind = lkey.kind();
Locale loc;
lkey.currentLocale(loc);
return BreakIterator::makeInstance(loc, kind, status);
}
virtual UBool isDefault() const override {
return countFactories() == 1;
}
};
ICUBreakIteratorService::~ICUBreakIteratorService() {}
// -------------------------------------
// defined in ucln_cmn.h
U_NAMESPACE_END
static icu::UInitOnce gInitOnceBrkiter {};
static icu::ICULocaleService* gService = nullptr;
/**
 * Cleanup callback: deletes the break iterator service, if any, and resets
 * the init-once flag. Always returns true.
 */
U_CDECL_BEGIN
static UBool U_CALLCONV breakiterator_cleanup() {
#if !UCONFIG_NO_SERVICE
    // Deleting a null pointer is a no-op, so no explicit check is needed.
    delete gService;
    gService = nullptr;
    gInitOnceBrkiter.reset();
#endif
    return true;
}
U_CDECL_END
U_NAMESPACE_BEGIN
// Init-once callback: creates the service and registers the cleanup hook.
static void U_CALLCONV
initService() {
    gService = new ICUBreakIteratorService();
    ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
}

// Returns the service, running initService() through the init-once guard.
// The result is nullptr if the allocation in initService() failed.
static ICULocaleService*
getService()
{
    umtx_initOnce(gInitOnceBrkiter, &initService);
    return gService;
}

// -------------------------------------

// True only if initialization was already triggered and the service exists;
// never triggers initialization by itself while the flag is still reset.
static inline UBool
hasService()
{
    if (gInitOnceBrkiter.isReset()) {
        return false;
    }
    return getService() != nullptr;
}
// -------------------------------------
/**
 * Registers a break iterator with the break iterator service.
 *
 * @param toAdopt Iterator to register; ownership passes to this function.
 * @param locale  Locale the iterator is registered for.
 * @param kind    Kind of break iterator.
 * @param status  In/out error code; set to U_MEMORY_ALLOCATION_ERROR when the
 *                service cannot be created.
 * @return The registry key returned by the service, or nullptr on failure.
 */
URegistryKey U_EXPORT2
BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
{
    ICULocaleService *service = getService();
    if (service == nullptr) {
        // The iterator was handed over for adoption; without a service nobody
        // else will ever own it, so release it here instead of leaking it.
        // NOTE(review): assumes adopt-on-failure semantics, matching the
        // "toAdopt" naming - confirm against the service implementation.
        delete toAdopt;
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    return service->registerInstance(toAdopt, locale, kind, status);
}
// -------------------------------------
// Removes a registration made through registerInstance().
// Returns false without touching the service when status already indicates
// failure; sets U_MEMORY_ALLOCATION_ERROR when no service exists.
UBool U_EXPORT2
BreakIterator::unregister(URegistryKey key, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return false;
    }
    if (!hasService()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    return gService->unregister(key, status);
}
// -------------------------------------
// Returns the service's enumeration of available locales, or nullptr when
// the service could not be created.
StringEnumeration* U_EXPORT2
BreakIterator::getAvailableLocales()
{
    ICULocaleService *service = getService();
    return service != nullptr ? service->getAvailableLocales() : nullptr;
}
#endif /* UCONFIG_NO_SERVICE */
// -------------------------------------
// Creates a break iterator of the given kind for a locale. If the service has
// been initialized, it is asked first; otherwise makeInstance() is used.
BreakIterator*
BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return nullptr;
    }

#if !UCONFIG_NO_SERVICE
    if (hasService()) {
        Locale actualLoc("");
        BreakIterator *fromService = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
        // TODO: As the service code works (since ICU 2.8), actualLoc is only
        // populated when a really registered break iterator was found. When
        // the handleDefault path is taken instead (nothing registered can
        // handle the requested locale), actualLoc comes back empty and the
        // returned object already carries its actual/valid locale data from
        // makeInstance, so it must not be touched. This should be revisited
        // and cleaned up.
        const bool registeredHit =
            U_SUCCESS(status) && (fromService != nullptr) && *actualLoc.getName() != 0;
        if (registeredHit) {
            U_LOCALE_BASED(locBased, *fromService);
            locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
        }
        return fromService;
    }
#endif

    return makeInstance(loc, kind, status);
}
// -------------------------------------
// Size of the stack buffers used below for the line-break type name and
// for the "ss" keyword value.
enum { kKeyValueLenMax = 32 };
// Creates a break iterator of the requested kind by mapping the kind to a
// rules type name and calling buildInstance(). Line and sentence iterators
// additionally look at locale keywords ("lb", "lw", "ss").
//
// @param loc    Requested locale.
// @param kind   One of the UBRK_* kinds handled in the switch below; any other
//               value sets U_ILLEGAL_ARGUMENT_ERROR.
// @param status In/out error code.
// @return The new iterator, or nullptr on failure.
BreakIterator*
BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
{
if (U_FAILURE(status)) {
return nullptr;
}
BreakIterator *result = nullptr;
switch (kind) {
case UBRK_CHARACTER:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
result = BreakIterator::buildInstance(loc, "grapheme", status);
UTRACE_EXIT_STATUS(status);
}
break;
case UBRK_WORD:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
result = BreakIterator::buildInstance(loc, "word", status);
UTRACE_EXIT_STATUS(status);
}
break;
case UBRK_LINE:
{
// Builds the type name "line[_<lb value>][_phrase]" from the locale keywords.
char lb_lw[kKeyValueLenMax];
UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
uprv_strcpy(lb_lw, "line");
UErrorCode kvStatus = U_ZERO_ERROR;
auto value = loc.getKeywordValue<CharString>("lb", kvStatus);
// Only the three known "lb" values are appended; anything else is ignored.
if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
uprv_strcat(lb_lw, "_");
uprv_strcat(lb_lw, value.data());
}
// lw=phrase is only supported in Japanese and Korean
if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
value = loc.getKeywordValue<CharString>("lw", kvStatus);
if (U_SUCCESS(kvStatus) && value == "phrase") {
uprv_strcat(lb_lw, "_");
uprv_strcat(lb_lw, value.data());
}
}
result = BreakIterator::buildInstance(loc, lb_lw, status);
UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
UTRACE_EXIT_STATUS(status);
}
break;
case UBRK_SENTENCE:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
result = BreakIterator::buildInstance(loc, "sentence", status);
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
// With the keyword ss=standard the plain sentence iterator is passed through
// a FilteredBreakIteratorBuilder, whose build() result replaces it.
char ssKeyValue[kKeyValueLenMax] = {0};
UErrorCode kvStatus = U_ZERO_ERROR;
int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
if (U_SUCCESS(kvStatus)) {
result = fbiBuilder->build(result, status);
delete fbiBuilder;
}
}
#endif
UTRACE_EXIT_STATUS(status);
}
break;
case UBRK_TITLE:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
result = BreakIterator::buildInstance(loc, "title", status);
UTRACE_EXIT_STATUS(status);
}
break;
default:
status = U_ILLEGAL_ARGUMENT_ERROR;
}
if (U_FAILURE(status)) {
return nullptr;
}
return result;
}
// Returns the locale of the requested type. The requested locale is kept in
// this object's own buffer; the other types go through LocaleBased.
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocale(type, status);
    }
    return Locale(requestLocale);
}

// Same as getLocale(), but returns the locale ID string.
const char *
BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocaleID(type, status);
    }
    return requestLocale;
}
// Default getRuleStatus(): a stub for derived BreakIterator classes that do
// not provide their own implementation. Always reports status 0.
int32_t BreakIterator::getRuleStatus() const {
    return 0;
}

// Default getRuleStatusVec(): a stub for derived BreakIterator classes that do
// not provide their own implementation. Reports a single status value of 0;
// if the vector has no room for it, U_BUFFER_OVERFLOW_ERROR is set and the
// required length (1) is still returned.
int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return 0;
    }
    if (capacity >= 1) {
        *fillInVec = 0;
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return 1;
}
// Constructs a BreakIterator with the given valid and actual locales.
// All three locale ID buffers are first set to empty strings, as in the
// default constructor: requestLocale is not set by setLocaleIDs() and would
// otherwise be read uninitialized by getLocale(ULOC_REQUESTED_LOCALE).
BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
    *validLocale = *actualLocale = *requestLocale = 0;
    U_LOCALE_BASED(locBased, (*this));
    locBased.setLocaleIDs(valid, actual);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
//eof

View file

@ -0,0 +1,161 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// bytesinkutil.cpp
// created: 2017sep14 Markus W. Scherer
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/edits.h"
#include "unicode/stringoptions.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "bytesinkutil.h"
#include "charstr.h"
#include "cmemory.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
// Converts (s16, s16Length) to UTF-8 and appends it to the sink chunk by
// chunk, then records in 'edits' (if given) that 'length' source bytes were
// replaced by the number of bytes written.
// Returns false if errorCode already indicates failure or if the byte count
// would overflow int32_t (U_INDEX_OUTOFBOUNDS_ERROR).
UBool
ByteSinkUtil::appendChange(int32_t length, const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return false; }
    char scratch[200];
    int32_t written = 0;    // total UTF-8 bytes appended so far
    int32_t srcIndex = 0;   // next UTF-16 unit to convert
    while (srcIndex < s16Length) {
        // Capacity hint for the sink, saturating instead of overflowing.
        const int32_t remaining = s16Length - srcIndex;
        int32_t hint;
        if (remaining < (INT32_MAX / 3)) {
            hint = remaining * 3;  // max 3 UTF-8 bytes per UTF-16 code unit
        } else if (remaining < (INT32_MAX / 2)) {
            hint = remaining * 2;
        } else {
            hint = INT32_MAX;
        }
        int32_t capacity;
        char *buffer = sink.GetAppendBuffer(U8_MAX_LENGTH, hint,
                                            scratch, UPRV_LENGTHOF(scratch), &capacity);
        // Keep room so that the last code point of the chunk always fits.
        capacity -= U8_MAX_LENGTH - 1;
        int32_t chunk = 0;
        while (srcIndex < s16Length && chunk < capacity) {
            UChar32 c;
            U16_NEXT_UNSAFE(s16, srcIndex, c);
            U8_APPEND_UNSAFE(buffer, chunk, c);
        }
        if (chunk > (INT32_MAX - written)) {
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return false;
        }
        sink.Append(buffer, chunk);
        written += chunk;
    }
    if (edits != nullptr) {
        edits->addReplace(length, written);
    }
    return true;
}
// Pointer-range overload: the source length is (limit - s), which must fit
// into int32_t (otherwise U_INDEX_OUTOFBOUNDS_ERROR).
UBool
ByteSinkUtil::appendChange(const uint8_t *s, const uint8_t *limit,
                           const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return false; }
    if ((limit - s) <= INT32_MAX) {
        const int32_t length = (int32_t)(limit - s);
        return appendChange(length, s16, s16Length, sink, edits, errorCode);
    }
    errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    return false;
}
// Encodes c as UTF-8, records (if 'edits' is given) that 'length' source
// bytes were replaced by the encoded bytes, and appends them to the sink.
void
ByteSinkUtil::appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits) {
    char encoded[U8_MAX_LENGTH];
    int32_t encodedLength = 0;
    U8_APPEND_UNSAFE(encoded, encodedLength, c);
    if (edits != nullptr) {
        edits->addReplace(length, encodedLength);
    }
    sink.Append(encoded, encodedLength);
}
namespace {

// Lead and trail byte of a two-byte UTF-8 sequence.
// See unicode/utf8.h U8_APPEND_UNSAFE().
inline uint8_t getTwoByteLead(UChar32 c) {
    return static_cast<uint8_t>(0xc0 | (c >> 6));
}

inline uint8_t getTwoByteTrail(UChar32 c) {
    return static_cast<uint8_t>(0x80 | (c & 0x3f));
}

}  // namespace
// Appends the two-byte UTF-8 form of c (U+0080..U+07FF) to the sink.
void
ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) {
    U_ASSERT(0x80 <= c && c <= 0x7ff);  // 2-byte UTF-8
    const char lead = (char)getTwoByteLead(c);
    const char trail = (char)getTwoByteTrail(c);
    const char s8[2] = { lead, trail };
    sink.Append(s8, 2);
}
// Records 'length' unchanged bytes in 'edits' (if given) and copies them to
// the sink unless U_OMIT_UNCHANGED_TEXT is set in 'options'.
void
ByteSinkUtil::appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
                                      ByteSink &sink, uint32_t options, Edits *edits) {
    U_ASSERT(length > 0);
    if (edits != nullptr) {
        edits->addUnchanged(length);
    }
    if ((options & U_OMIT_UNCHANGED_TEXT) != 0) {
        return;
    }
    sink.Append(reinterpret_cast<const char *>(s), length);
}
// Pointer-range overload for unchanged text: the range [s, limit[ must fit
// into int32_t (otherwise U_INDEX_OUTOFBOUNDS_ERROR); an empty range is a
// successful no-op.
UBool
ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit,
                              ByteSink &sink, uint32_t options, Edits *edits,
                              UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return false; }
    if ((limit - s) > INT32_MAX) {
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }
    const int32_t length = (int32_t)(limit - s);
    if (length <= 0) {
        return true;
    }
    appendNonEmptyUnchanged(s, length, sink, options, edits);
    return true;
}
CharStringByteSink::CharStringByteSink(CharString* dest) : dest_(*dest) {
}
CharStringByteSink::~CharStringByteSink() = default;
void
CharStringByteSink::Append(const char* bytes, int32_t n) {
UErrorCode status = U_ZERO_ERROR;
dest_.append(bytes, n, status);
// Any errors are silently ignored.
}
// Returns a buffer for the caller to write into before calling Append().
// The destination string's own append buffer is preferred; if it cannot be
// obtained, the caller-provided scratch buffer is returned instead.
// Invalid arguments yield nullptr with *result_capacity set to 0.
char*
CharStringByteSink::GetAppendBuffer(int32_t min_capacity,
                                    int32_t desired_capacity_hint,
                                    char* scratch,
                                    int32_t scratch_capacity,
                                    int32_t* result_capacity) {
    const bool argumentsOk = min_capacity >= 1 && scratch_capacity >= min_capacity;
    if (!argumentsOk) {
        *result_capacity = 0;
        return nullptr;
    }

    UErrorCode status = U_ZERO_ERROR;
    char* fromDest = dest_.getAppendBuffer(
        min_capacity,
        desired_capacity_hint,
        *result_capacity,
        status);
    if (U_FAILURE(status)) {
        // Fall back to the scratch buffer.
        *result_capacity = scratch_capacity;
        return scratch;
    }
    return fromDest;
}
U_NAMESPACE_END

View file

@ -0,0 +1,156 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// bytesinkutil.h
// created: 2017sep14 Markus W. Scherer
#ifndef BYTESINKUTIL_H
#define BYTESINKUTIL_H
#include <type_traits>
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/edits.h"
#include "charstr.h"
#include "cmemory.h"
#include "uassert.h"
#include "ustr_imp.h"
U_NAMESPACE_BEGIN
class ByteSink;
class Edits;
/**
 * A ByteSink that appends to a CharString. The sink holds a reference to the
 * destination and is neither default-constructible nor copyable.
 */
class U_COMMON_API CharStringByteSink : public ByteSink {
public:
/** @param dest The CharString to append to; it is dereferenced and kept by reference. */
CharStringByteSink(CharString* dest);
~CharStringByteSink() override;
CharStringByteSink() = delete;
CharStringByteSink(const CharStringByteSink&) = delete;
CharStringByteSink& operator=(const CharStringByteSink&) = delete;
/** Appends n bytes to the destination. */
void Append(const char* bytes, int32_t n) override;
/**
 * Returns a writable buffer and stores its capacity in *result_capacity.
 * NOTE(review): expected to be followed by an Append() call with the
 * returned buffer - confirm against the ByteSink contract.
 */
char* GetAppendBuffer(int32_t min_capacity,
int32_t desired_capacity_hint,
char* scratch,
int32_t scratch_capacity,
int32_t* result_capacity) override;
private:
/** The destination string. */
CharString& dest_;
};
// CharString doesn't provide the public API that StringByteSink requires a
// string class to have, so this template specialization replaces the default
// implementation of StringByteSink<CharString> with CharStringByteSink.
// Both constructors simply forward the destination; the initial append
// capacity of the two-argument form is ignored.
template<>
class StringByteSink<CharString> : public CharStringByteSink {
public:
StringByteSink(CharString* dest) : CharStringByteSink(dest) { }
StringByteSink(CharString* dest, int32_t /*initialAppendCapacity*/) : CharStringByteSink(dest) { }
};
/**
 * Static-only helper functions that write to a ByteSink.
 * Several of them also take an Edits pointer; what is recorded there is
 * determined by the out-of-line definitions, which are not in this header.
 * The class cannot be instantiated.
 */
class U_COMMON_API ByteSinkUtil {
public:
ByteSinkUtil() = delete; // all static
/** (length) bytes were mapped to valid (s16, s16Length). */
static UBool appendChange(int32_t length,
const char16_t *s16, int32_t s16Length,
ByteSink &sink, Edits *edits, UErrorCode &errorCode);
/** The bytes at [s, limit[ were mapped to valid (s16, s16Length). */
static UBool appendChange(const uint8_t *s, const uint8_t *limit,
const char16_t *s16, int32_t s16Length,
ByteSink &sink, Edits *edits, UErrorCode &errorCode);
/** (length) bytes were mapped/changed to valid code point c. */
static void appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits = nullptr);
/** The few bytes at [src, nextSrc[ were mapped/changed to valid code point c. */
static inline void appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c,
ByteSink &sink, Edits *edits = nullptr) {
// Convert the pointer range to a byte count and use the length-based overload.
appendCodePoint((int32_t)(nextSrc - src), c, sink, edits);
}
/** Append the two-byte character (U+0080..U+07FF). */
static void appendTwoBytes(UChar32 c, ByteSink &sink);
/**
 * Handles (length) unchanged bytes starting at s.
 * Returns false without doing anything if errorCode already indicates a failure.
 * Otherwise forwards to appendNonEmptyUnchanged() when length > 0, and returns true.
 */
static UBool appendUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits,
UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return false; }
if (length > 0) { appendNonEmptyUnchanged(s, length, sink, options, edits); }
return true;
}
/** Pointer-range variant of appendUnchanged() for the bytes at [s, limit[ (defined out of line). */
static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit,
ByteSink &sink, uint32_t options, Edits *edits,
UErrorCode &errorCode);
/**
* Calls a lambda that writes to a ByteSink with a CheckedArrayByteSink
* and then returns through u_terminateChars(), in order to implement
* the classic ICU4C C API writing to a fixed-size buffer on top of a
* contemporary C++ API.
*
* Returns 0 if status is a failure on entry or after the lambda ran.
*
* @param buffer receiving buffer
* @param capacity capacity of receiving buffer
* @param lambda that gets called with the sink as an argument
* @param status set to U_BUFFER_OVERFLOW_ERROR on overflow
* @return number of bytes written, or needed (in case of overflow)
* @internal
*/
template <typename F,
typename = std::enable_if_t<
std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
static int32_t viaByteSinkToTerminatedChars(char* buffer, int32_t capacity,
F&& lambda,
UErrorCode& status) {
if (U_FAILURE(status)) { return 0; }
CheckedArrayByteSink sink(buffer, capacity);
lambda(sink, status);
if (U_FAILURE(status)) { return 0; }
int32_t reslen = sink.NumberOfBytesAppended();
if (sink.Overflowed()) {
// Report the overflow and the appended byte count; no terminator is written.
status = U_BUFFER_OVERFLOW_ERROR;
return reslen;
}
return u_terminateChars(buffer, capacity, reslen, &status);
}
/**
* Calls a lambda that writes to a ByteSink with a CharStringByteSink and
* then returns a CharString, in order to implement a contemporary C++ API
* on top of a C/C++ compatibility ByteSink API.
*
* If status is a failure on entry, the lambda is not called and an empty
* string is returned. The status is not re-checked after the lambda ran:
* whatever was appended to the string is returned as is.
*
* @param lambda that gets called with the sink as an argument
* @param status to check and report
* @return the resulting string, or an empty string (in case of error)
* @internal
*/
template <typename F,
typename = std::enable_if_t<
std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
static CharString viaByteSinkToCharString(F&& lambda, UErrorCode& status) {
if (U_FAILURE(status)) { return {}; }
CharString result;
CharStringByteSink sink(&result);
lambda(sink, status);
return result;
}
private:
// Implementation of appendUnchanged(); the inline caller above only
// invokes it with length > 0.
static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
ByteSink &sink, uint32_t options, Edits *edits);
};
U_NAMESPACE_END
#endif //BYTESINKUTIL_H

View file

@ -0,0 +1,85 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// Copyright (C) 2009-2011, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Copyright 2007 Google Inc. All Rights Reserved.
// Author: sanjay@google.com (Sanjay Ghemawat)
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
// Out-of-line destructor definition; there is nothing to release.
ByteSink::~ByteSink() = default;
// Default implementation: hands out the caller's scratch buffer when it can
// satisfy the request. The desired capacity hint is ignored.
// On failure (min_capacity < 1, or scratch smaller than min_capacity)
// *result_capacity is set to 0 and nullptr is returned.
char* ByteSink::GetAppendBuffer(int32_t min_capacity,
                                int32_t /*desired_capacity_hint*/,
                                char* scratch, int32_t scratch_capacity,
                                int32_t* result_capacity) {
  const bool usable = min_capacity >= 1 && scratch_capacity >= min_capacity;
  *result_capacity = usable ? scratch_capacity : 0;
  return usable ? scratch : nullptr;
}
// Default implementation: does nothing.
void ByteSink::Flush() {}
// Starts with an empty sink over outbuf; a negative capacity is treated as 0.
CheckedArrayByteSink::CheckedArrayByteSink(char* outbuf, int32_t capacity)
    : outbuf_(outbuf),
      capacity_(capacity > 0 ? capacity : 0),
      size_(0),
      appended_(0),
      overflowed_(false) {}
// Nothing is released here; this destructor does not free outbuf_.
CheckedArrayByteSink::~CheckedArrayByteSink() = default;
// Returns the sink to its initial empty state (same buffer and capacity).
CheckedArrayByteSink& CheckedArrayByteSink::Reset() {
  overflowed_ = false;
  appended_ = 0;
  size_ = 0;
  return *this;
}
void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
if (n <= 0) {
return;
}
if (n > (INT32_MAX - appended_)) {
// TODO: Report as integer overflow, not merely buffer overflow.
appended_ = INT32_MAX;
overflowed_ = true;
return;
}
appended_ += n;
int32_t available = capacity_ - size_;
if (n > available) {
n = available;
overflowed_ = true;
}
if (n > 0 && bytes != (outbuf_ + size_)) {
uprv_memcpy(outbuf_ + size_, bytes, n);
}
size_ += n;
}
// Returns the unused tail of the output array when it has at least
// min_capacity bytes left, otherwise the caller's scratch buffer.
// Fails (nullptr, *result_capacity = 0) when min_capacity < 1 or the
// scratch buffer is smaller than min_capacity.
char* CheckedArrayByteSink::GetAppendBuffer(int32_t min_capacity,
                                            int32_t /*desired_capacity_hint*/,
                                            char* scratch,
                                            int32_t scratch_capacity,
                                            int32_t* result_capacity) {
  const bool validRequest = min_capacity >= 1 && scratch_capacity >= min_capacity;
  if (!validRequest) {
    *result_capacity = 0;
    return nullptr;
  }
  const int32_t room = capacity_ - size_;
  if (room < min_capacity) {
    // Not enough space left in the array: fall back to the scratch buffer.
    *result_capacity = scratch_capacity;
    return scratch;
  }
  *result_capacity = room;
  return outbuf_ + size_;
}
U_NAMESPACE_END

View file

@ -0,0 +1,441 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytestrie.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/bytestrie.h"
#include "unicode/uobject.h"
#include "cmemory.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
// Releases ownedArray_ with uprv_free().
// NOTE(review): presumably ownedArray_ is null when the trie does not own
// its byte array; confirm in the header.
BytesTrie::~BytesTrie() {
uprv_free(ownedArray_);
}
// Decodes a value from its lead byte and the bytes at pos that follow it.
// leadByte must already be shifted right by 1 by the caller.
// The lead byte range selects how many of the following bytes are read (0 to 4).
int32_t
BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
    if(leadByte<kMinTwoByteValueLead) {
        // The value is contained in the lead byte itself.
        return leadByte-kMinOneByteValueLead;
    }
    if(leadByte<kMinThreeByteValueLead) {
        return ((leadByte-kMinTwoByteValueLead)<<8)|*pos;
    }
    if(leadByte<kFourByteValueLead) {
        return ((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
    }
    if(leadByte==kFourByteValueLead) {
        return (pos[0]<<16)|(pos[1]<<8)|pos[2];
    }
    return (pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
}
// Reads the variable-length delta at pos and returns the position that lies
// delta bytes after the end of the encoded delta.
const uint8_t *
BytesTrie::jumpByDelta(const uint8_t *pos) {
    const int32_t lead=*pos++;
    int32_t delta;
    if(lead<kMinTwoByteDeltaLead) {
        // Single-byte delta: the lead byte is the delta.
        delta=lead;
    } else if(lead<kMinThreeByteDeltaLead) {
        delta=((lead-kMinTwoByteDeltaLead)<<8)|pos[0];
        pos+=1;
    } else if(lead<kFourByteDeltaLead) {
        delta=((lead-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
        pos+=2;
    } else if(lead==kFourByteDeltaLead) {
        delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
        pos+=3;
    } else {
        delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
        pos+=4;
    }
    return pos+delta;
}
// Reports the result for the current state without consuming input:
// NO_MATCH when the trie is stopped (pos_ is null), NO_VALUE while inside
// a linear-match node or when the current node is not a value node,
// otherwise the value result for the node at pos_.
UStringTrieResult
BytesTrie::current() const {
    const uint8_t *pos=pos_;
    if(pos==nullptr) {
        return USTRINGTRIE_NO_MATCH;
    }
    if(remainingMatchLength_>=0) {
        return USTRINGTRIE_NO_VALUE;
    }
    const int32_t node=*pos;
    if(node>=kMinValueLead) {
        return valueResult(node);
    }
    return USTRINGTRIE_NO_VALUE;
}
// Handles a branch node: selects the outbound edge that matches inByte.
// pos points just after the node's lead byte. length is the value taken from
// the lead byte; 0 means that the length is stored in the following byte.
// Either way the length is incremented once before use.
// On a match, the new position is stored in pos_ and the result for the
// reached node is returned (USTRINGTRIE_FINAL_VALUE leaves pos_ at the value).
// If the last candidate byte does not match, calls stop() and returns
// USTRINGTRIE_NO_MATCH.
UStringTrieResult
BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
// Branch according to the current byte.
if(length==0) {
length=*pos++;
}
++length;
// The length of the branch is the number of bytes to select from.
// The data structure encodes a binary search.
while(length>kMaxBranchLinearSubNodeLength) {
if(inByte<*pos++) {
// Less-than edge: keep the lower half and follow the jump delta.
length>>=1;
pos=jumpByDelta(pos);
} else {
// Greater-or-equal edge: keep the upper half, which follows the delta.
length=length-(length>>1);
pos=skipDelta(pos);
}
}
// Drop down to linear search for the last few bytes.
// length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
// and divides length by 2.
do {
if(inByte==*pos++) {
UStringTrieResult result;
int32_t node=*pos;
U_ASSERT(node>=kMinValueLead);
if(node&kValueIsFinal) {
// Leave the final value for getValue() to read.
result=USTRINGTRIE_FINAL_VALUE;
} else {
// Use the non-final value as the jump delta.
++pos;
// int32_t delta=readValue(pos, node>>1);
// (Inlined copy of readValue() that also advances pos past the value bytes.)
node>>=1;
int32_t delta;
if(node<kMinTwoByteValueLead) {
delta=node-kMinOneByteValueLead;
} else if(node<kMinThreeByteValueLead) {
delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
} else if(node<kFourByteValueLead) {
delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
pos+=2;
} else if(node==kFourByteValueLead) {
delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
pos+=3;
} else {
delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
pos+=4;
}
// end readValue()
pos+=delta;
node=*pos;
result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
}
pos_=pos;
return result;
}
--length;
pos=skipValue(pos);
} while(length>1);
// Last candidate: its comparison byte is not followed by a value/delta here;
// the next node starts right after it.
if(inByte==*pos++) {
pos_=pos;
int32_t node=*pos;
return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
} else {
stop();
return USTRINGTRIE_NO_MATCH;
}
}
// Matches inByte against the node that starts at pos.
// next(int32_t) calls this when no linear-match node is pending.
// Intermediate (non-final) value nodes are skipped. On a mismatch, or when a
// final value node is reached, calls stop() and returns USTRINGTRIE_NO_MATCH.
UStringTrieResult
BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
for(;;) {
int32_t node=*pos++;
if(node<kMinLinearMatch) {
// Branch node: the lead byte doubles as the length argument.
return branchNext(pos, node, inByte);
} else if(node<kMinValueLead) {
// Match the first of length+1 bytes.
int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
if(inByte==*pos++) {
remainingMatchLength_=--length;
pos_=pos;
// A value can only be reported once the whole linear match is consumed.
return (length<0 && (node=*pos)>=kMinValueLead) ?
valueResult(node) : USTRINGTRIE_NO_VALUE;
} else {
// No match.
break;
}
} else if(node&kValueIsFinal) {
// No further matching bytes.
break;
} else {
// Skip intermediate value.
pos=skipValue(pos, node);
// The next node must not also be a value node.
U_ASSERT(*pos<kMinValueLead);
}
}
stop();
return USTRINGTRIE_NO_MATCH;
}
// Traverses the trie from the current state for one input byte.
// A negative inByte is mapped to the unsigned range by adding 0x100.
// Returns USTRINGTRIE_NO_MATCH if the trie is already stopped or the byte
// does not continue any stored sequence.
UStringTrieResult
BytesTrie::next(int32_t inByte) {
    const uint8_t *pos=pos_;
    if(pos==nullptr) {
        return USTRINGTRIE_NO_MATCH;
    }
    if(inByte<0) {
        inByte+=0x100;
    }
    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    if(length<0) {
        // Not inside a linear-match node: look at the node at pos.
        return nextImpl(pos, inByte);
    }
    // Remaining part of a linear-match node.
    if(inByte!=*pos) {
        stop();
        return USTRINGTRIE_NO_MATCH;
    }
    ++pos;
    --length;
    remainingMatchLength_=length;
    pos_=pos;
    if(length>=0) {
        // More bytes of the linear match are still pending.
        return USTRINGTRIE_NO_VALUE;
    }
    const int32_t node=*pos;
    return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
}
// Traverses the trie from the current state for a whole byte sequence.
//
// s       the input bytes; NUL-terminated if sLength<0
// sLength number of bytes in s, or negative for a NUL-terminated string
//
// Returns current() for empty input, USTRINGTRIE_NO_MATCH if the trie is
// already stopped or the sequence does not match, otherwise the result for
// the state reached after the last input byte.
//
// Every input byte is read as uint8_t. Where plain char is signed, reading
// bytes 0x80..0xff through char yields negative ints which never compare
// equal to the unsigned trie bytes; next(int32_t) maps negative input to
// 0..0xff, and this overload has to treat such bytes the same way.
UStringTrieResult
BytesTrie::next(const char *s, int32_t sLength) {
    if(sLength<0 ? *s==0 : sLength==0) {
        // Empty input.
        return current();
    }
    const uint8_t *pos=pos_;
    if(pos==nullptr) {
        return USTRINGTRIE_NO_MATCH;
    }
    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    for(;;) {
        // Fetch the next input byte, if there is one.
        // Continue a linear-match node without rechecking sLength<0.
        int32_t inByte;
        if(sLength<0) {
            for(;;) {
                if((inByte=static_cast<uint8_t>(*s++))==0) {
                    remainingMatchLength_=length;
                    pos_=pos;
                    int32_t node;
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
                            valueResult(node) : USTRINGTRIE_NO_VALUE;
                }
                if(length<0) {
                    remainingMatchLength_=length;
                    break;
                }
                if(inByte!=*pos) {
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                ++pos;
                --length;
            }
        } else {
            for(;;) {
                if(sLength==0) {
                    remainingMatchLength_=length;
                    pos_=pos;
                    int32_t node;
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
                            valueResult(node) : USTRINGTRIE_NO_VALUE;
                }
                inByte=static_cast<uint8_t>(*s++);
                --sLength;
                if(length<0) {
                    remainingMatchLength_=length;
                    break;
                }
                if(inByte!=*pos) {
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                ++pos;
                --length;
            }
        }
        // inByte now holds an input byte that has to be matched against the
        // node at pos (no linear match is pending).
        for(;;) {
            int32_t node=*pos++;
            if(node<kMinLinearMatch) {
                UStringTrieResult result=branchNext(pos, node, inByte);
                if(result==USTRINGTRIE_NO_MATCH) {
                    return USTRINGTRIE_NO_MATCH;
                }
                // Fetch the next input byte, if there is one.
                if(sLength<0) {
                    if((inByte=static_cast<uint8_t>(*s++))==0) {
                        return result;
                    }
                } else {
                    if(sLength==0) {
                        return result;
                    }
                    inByte=static_cast<uint8_t>(*s++);
                    --sLength;
                }
                if(result==USTRINGTRIE_FINAL_VALUE) {
                    // No further matching bytes.
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
            } else if(node<kMinValueLead) {
                // Match length+1 bytes.
                length=node-kMinLinearMatch;  // Actual match length minus 1.
                if(inByte!=*pos) {
                    stop();
                    return USTRINGTRIE_NO_MATCH;
                }
                ++pos;
                --length;
                // Continue the linear match in the outer loop.
                break;
            } else if(node&kValueIsFinal) {
                // No further matching bytes.
                stop();
                return USTRINGTRIE_NO_MATCH;
            } else {
                // Skip intermediate value.
                pos=skipValue(pos, node);
                // The next node must not also be a value node.
                U_ASSERT(*pos<kMinValueLead);
            }
        }
    }
}
// Checks whether all values reachable through a branch are the same.
// pos points at the first comparison byte of the branch and length is the
// number of remaining branch entries.
// haveUniqueValue/uniqueValue carry the value found so far, if any;
// uniqueValue is updated when the first value is found.
// Returns nullptr as soon as two different values are seen; otherwise returns
// the position just after the last comparison byte.
const uint8_t *
BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
UBool haveUniqueValue, int32_t &uniqueValue) {
while(length>kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison byte
// Recurse into the less-than half, then continue with the other half.
if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
return nullptr;
}
length=length-(length>>1);
pos=skipDelta(pos);
}
do {
++pos; // ignore a comparison byte
// handle its value
int32_t node=*pos++;
UBool isFinal=(UBool)(node&kValueIsFinal);
int32_t value=readValue(pos, node>>1);
pos=skipValue(pos, node);
if(isFinal) {
if(haveUniqueValue) {
if(value!=uniqueValue) {
return nullptr;
}
} else {
uniqueValue=value;
haveUniqueValue=true;
}
} else {
// A non-final value is a jump delta to the sub-node to examine.
if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
return nullptr;
}
haveUniqueValue=true;
}
} while(--length>1);
return pos+1; // ignore the last comparison byte
}
// Walks the trie from pos and checks whether every value reachable from
// there is the same.
// haveUniqueValue/uniqueValue carry the value found so far, if any;
// uniqueValue is set when the first value is found.
// Returns false as soon as two different values are seen, and true when a
// final value is reached without a conflict.
UBool
BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
for(;;) {
int32_t node=*pos++;
if(node<kMinLinearMatch) {
// Branch node; a lead byte of 0 means that the length is in the next byte.
if(node==0) {
node=*pos++;
}
pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
if(pos==nullptr) {
return false;
}
haveUniqueValue=true;
} else if(node<kMinValueLead) {
// linear-match node
pos+=node-kMinLinearMatch+1; // Ignore the match bytes.
} else {
UBool isFinal=(UBool)(node&kValueIsFinal);
int32_t value=readValue(pos, node>>1);
if(haveUniqueValue) {
if(value!=uniqueValue) {
return false;
}
} else {
uniqueValue=value;
haveUniqueValue=true;
}
if(isFinal) {
return true;
}
pos=skipValue(pos, node);
}
}
}
// Appends to out each byte that continues the sequence from the current
// state, and returns how many bytes were appended.
// Returns 0 if the trie is stopped or the current node is a final value.
int32_t
BytesTrie::getNextBytes(ByteSink &out) const {
    const uint8_t *pos=pos_;
    if(pos==nullptr) {
        return 0;
    }
    if(remainingMatchLength_>=0) {
        append(out, *pos);  // Next byte of a pending linear-match node.
        return 1;
    }
    int32_t node=*pos++;
    if(node>=kMinValueLead) {
        if(node&kValueIsFinal) {
            return 0;
        }
        // Step over the intermediate value to the node that follows it.
        pos=skipValue(pos, node);
        node=*pos++;
        U_ASSERT(node<kMinValueLead);
    }
    if(node>=kMinLinearMatch) {
        // First byte of the linear-match node.
        append(out, *pos);
        return 1;
    }
    // Branch node; a lead byte of 0 means that the length is in the next byte.
    if(node==0) {
        node=*pos++;
    }
    const int32_t branchLength=node+1;
    getNextBranchBytes(pos, branchLength, out);
    return branchLength;
}
// Appends to out the comparison byte of every entry of the branch at pos.
// length is the number of branch entries. The binary-search part recurses into
// the less-than half first and then continues with the other half.
void
BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
while(length>kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison byte
getNextBranchBytes(jumpByDelta(pos), length>>1, out);
length=length-(length>>1);
pos=skipDelta(pos);
}
// Linear part: each entry except the last is a byte followed by a value.
do {
append(out, *pos++);
pos=skipValue(pos);
} while(--length>1);
append(out, *pos);
}
// Appends the single byte c (converted to char) to the sink.
void
BytesTrie::append(ByteSink &out, int c) {
    const char oneByte[1]={ (char)c };
    out.Append(oneByte, 1);
}
U_NAMESPACE_END

View file

@ -0,0 +1,512 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytestriebuilder.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/bytestriebuilder.h"
#include "unicode/stringpiece.h"
#include "charstr.h"
#include "cmemory.h"
#include "uhash.h"
#include "uarrsort.h"
#include "uassert.h"
#include "ustr_imp.h"
U_NAMESPACE_BEGIN
/*
* Note: This builder implementation stores (bytes, value) pairs with full copies
* of the byte sequences, until the BytesTrie is built.
* It might(!) take less memory if we collected the data in a temporary, dynamic trie.
*/
class BytesTrieElement : public UMemory {
public:
// Use compiler's default constructor, initializes nothing.
void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
StringPiece getString(const CharString &strings) const {
int32_t offset=stringOffset;
int32_t length;
if(offset>=0) {
length=(uint8_t)strings[offset++];
} else {
offset=~offset;
length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
offset+=2;
}
return StringPiece(strings.data()+offset, length);
}
int32_t getStringLength(const CharString &strings) const {
int32_t offset=stringOffset;
if(offset>=0) {
return (uint8_t)strings[offset];
} else {
offset=~offset;
return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
}
}
char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
int32_t getValue() const { return value; }
int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
private:
const char *data(const CharString &strings) const {
int32_t offset=stringOffset;
if(offset>=0) {
++offset;
} else {
offset=~offset+2;
}
return strings.data()+offset;
}
// If the stringOffset is non-negative, then the first strings byte contains
// the string length.
// If the stringOffset is negative, then the first two strings bytes contain
// the string length (big-endian), and the offset needs to be bit-inverted.
// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
int32_t stringOffset;
int32_t value;
};
// Stores val and appends s, preceded by its length, to strings.
// Does nothing if errorCode is already a failure. Sets
// U_INDEX_OUTOFBOUNDS_ERROR if s is longer than 0xffff bytes.
void
BytesTrieElement::setTo(StringPiece s, int32_t val,
                        CharString &strings, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    const int32_t length=s.length();
    if(length>0xffff) {
        // Too long: We store the length in 1 or 2 bytes.
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }
    const int32_t start=strings.length();
    const bool twoByteLength=length>0xff;
    if(twoByteLength) {
        // High byte of the big-endian length.
        strings.append((char)(length>>8), errorCode);
    }
    strings.append((char)length, errorCode);
    // A bit-inverted (negative) offset marks a two-byte length prefix.
    stringOffset= twoByteLength ? ~start : start;
    value=val;
    strings.append(s, errorCode);
}
// Compares this element's bytes with other's via uprv_memcmp() over the
// common prefix; if that is equal, the shorter string orders first.
// Returns a negative, zero or positive number.
int32_t
BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
    // TODO: add StringPiece::compare(), see ticket #8187
    StringPiece mine=getString(strings);
    StringPiece theirs=other.getString(strings);
    const int32_t lengthDiff=mine.length()-theirs.length();
    const int32_t commonLength= lengthDiff<=0 ? mine.length() : theirs.length();
    const int32_t diff=uprv_memcmp(mine.data(), theirs.data(), commonLength);
    if(diff!=0) {
        return diff;
    }
    return lengthDiff;
}
// Creates an empty builder. Allocates the shared string storage unless
// errorCode is already a failure; sets U_MEMORY_ALLOCATION_ERROR if that
// allocation fails.
BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
        : strings(nullptr), elements(nullptr), elementsCapacity(0), elementsLength(0),
          bytes(nullptr), bytesCapacity(0), bytesLength(0) {
    if(U_SUCCESS(errorCode)) {
        strings=new CharString();
        if(strings==nullptr) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
// Releases the string storage, the element array and the serialized bytes.
// bytes is released with uprv_free() because it is allocated with uprv_malloc().
BytesTrieBuilder::~BytesTrieBuilder() {
delete strings;
delete[] elements;
uprv_free(bytes);
}
// Adds a (byte sequence, value) pair.
// Does nothing if errorCode is already a failure. Sets U_NO_WRITE_PERMISSION
// if the trie has already been built (bytesLength>0), and
// U_MEMORY_ALLOCATION_ERROR if the element array cannot be grown.
// Returns *this.
BytesTrieBuilder &
BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return *this;
    }
    if(bytesLength>0) {
        // Cannot add elements after building.
        errorCode=U_NO_WRITE_PERMISSION;
        return *this;
    }
    if(elementsLength==elementsCapacity) {
        // Grow the array: start at 1024 elements, then quadruple.
        const int32_t newCapacity= elementsCapacity==0 ? 1024 : 4*elementsCapacity;
        BytesTrieElement *grown=new BytesTrieElement[newCapacity];
        if(grown==nullptr) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return *this; // error instead of dereferencing null
        }
        if(elementsLength>0) {
            uprv_memcpy(grown, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
        }
        delete[] elements;
        elements=grown;
        elementsCapacity=newCapacity;
    }
    BytesTrieElement &slot=elements[elementsLength];
    ++elementsLength;
    slot.setTo(s, value, *strings, errorCode);
    return *this;
}
U_CDECL_BEGIN
// Comparator for uprv_sortArray(): context is the shared CharString,
// left and right point to BytesTrieElement objects.
static int32_t U_CALLCONV
compareElementStrings(const void *context, const void *left, const void *right) {
    const CharString &strings=*static_cast<const CharString *>(context);
    const BytesTrieElement &lhs=*static_cast<const BytesTrieElement *>(left);
    const BytesTrieElement &rhs=*static_cast<const BytesTrieElement *>(right);
    return lhs.compareStringTo(rhs, strings);
}
U_CDECL_END
// Builds the trie and returns a new BytesTrie that takes over the serialized
// byte array. Returns nullptr if building fails or the BytesTrie object
// cannot be allocated (U_MEMORY_ALLOCATION_ERROR).
BytesTrie *
BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildBytes(buildOption, errorCode);
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    // The trie data occupies the last bytesLength bytes of the array.
    BytesTrie *newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
    if(newTrie==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    bytes=nullptr;  // The new trie now owns the array.
    bytesCapacity=0;
    return newTrie;
}
// Builds the trie and returns a view of the serialized bytes, which remain
// in the builder's own array. Returns a default-constructed StringPiece on
// failure.
StringPiece
BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildBytes(buildOption, errorCode);
    StringPiece result;
    if(U_FAILURE(errorCode)) {
        return result;
    }
    // The trie data occupies the last bytesLength bytes of the array.
    result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
    return result;
}
// Serializes the added elements into the bytes array, unless that has already
// been done.
// Errors: U_INDEX_OUTOFBOUNDS_ERROR if no elements were added,
// U_ILLEGAL_ARGUMENT_ERROR if two elements have the same byte sequence,
// U_MEMORY_ALLOCATION_ERROR if the output array cannot be allocated.
void
BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return;
}
if(bytes!=nullptr && bytesLength>0) {
// Already built.
return;
}
if(bytesLength==0) {
if(elementsLength==0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
// Sort the elements by their byte sequences.
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
compareElementStrings, strings,
false, // need not be a stable sort
&errorCode);
if(U_FAILURE(errorCode)) {
return;
}
// Duplicate strings are not allowed.
// After sorting, duplicates are adjacent, so comparing neighbors suffices.
StringPiece prev=elements[0].getString(*strings);
for(int32_t i=1; i<elementsLength; ++i) {
StringPiece current=elements[i].getString(*strings);
if(prev==current) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
prev=current;
}
}
// Create and byte-serialize the trie for the elements.
bytesLength=0;
// Initial capacity: the size of the collected strings, but at least 1024.
int32_t capacity=strings->length();
if(capacity<1024) {
capacity=1024;
}
if(bytesCapacity<capacity) {
uprv_free(bytes);
bytes=static_cast<char *>(uprv_malloc(capacity));
if(bytes==nullptr) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
bytesCapacity=0;
return;
}
bytesCapacity=capacity;
}
StringTrieBuilder::build(buildOption, elementsLength, errorCode);
// ensureCapacity() sets bytes to nullptr when growing the array fails.
if(bytes==nullptr) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
}
}
// Removes all added elements and discards the built state.
// The element array and the bytes array are kept, so their capacity is reused.
// Returns *this.
BytesTrieBuilder &
BytesTrieBuilder::clear() {
strings->clear();
elementsLength=0;
bytesLength=0;
return *this;
}
// Returns the length in bytes of the string of element i.
int32_t
BytesTrieBuilder::getElementStringLength(int32_t i) const {
return elements[i].getStringLength(*strings);
}
// Returns the byte at byteIndex of element i's string, as an unsigned value
// (0..0xff) widened to char16_t.
char16_t
BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
return (uint8_t)elements[i].charAt(byteIndex, *strings);
}
// Returns the value stored with element i.
int32_t
BytesTrieBuilder::getElementValue(int32_t i) const {
return elements[i].getValue();
}
// Starting after byteIndex, finds the first index at which the strings of
// elements first and last differ, or the length of the first element's
// string if they agree up to there. The byte at byteIndex itself is not compared.
int32_t
BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    const BytesTrieElement &firstElement=elements[first];
    const BytesTrieElement &lastElement=elements[last];
    const int32_t minStringLength=firstElement.getStringLength(*strings);
    for(;;) {
        ++byteIndex;
        if(byteIndex>=minStringLength) {
            break;
        }
        if(firstElement.charAt(byteIndex, *strings)!=lastElement.charAt(byteIndex, *strings)) {
            break;
        }
    }
    return byteIndex;
}
// Counts the runs of equal bytes at byteIndex among elements [start, limit[.
// At least one element is always examined.
int32_t
BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    int32_t distinct=0;  // Number of different bytes at byteIndex.
    int32_t i=start;
    do {
        const char unit=elements[i].charAt(byteIndex, *strings);
        // Skip the rest of the run of elements with the same byte.
        for(++i; i<limit && unit==elements[i].charAt(byteIndex, *strings); ++i) {}
        ++distinct;
    } while(i<limit);
    return distinct;
}
// Starting at element i, skips count runs of elements that share the same
// byte at byteIndex, and returns the index of the first element after them.
// There is no upper bound check here: a later element with a different byte
// must exist.
int32_t
BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    do {
        const char unit=elements[i].charAt(byteIndex, *strings);
        for(++i; unit==elements[i].charAt(byteIndex, *strings); ++i) {}
    } while(--count>0);
    return i;
}
// Starting at element i, returns the index of the first element whose byte
// at byteIndex differs from the given byte.
// There is no upper bound check here: such an element must exist.
int32_t
BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const {
    const char unit=(char)byte;
    for(; unit==elements[i].charAt(byteIndex, *strings); ++i) {}
    return i;
}
// Creates a linear-match node for len bytes and mixes the hash of those bytes
// into the hash code set up by the base class. The arithmetic is done in
// uint32_t so that it wraps around.
BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
        : LinearMatchNode(len, nextNode), s(bytes) {
    const uint32_t scaledBase=static_cast<uint32_t>(hash)*37u;
    const uint32_t bytesHash=static_cast<uint32_t>(ustr_hashCharsN(bytes, len));
    hash=static_cast<int32_t>(scaledBase + bytesHash);
}
// Two nodes are equal if they are the same object, or if the base class
// considers them equal and their match bytes are identical.
// The downcast happens only after the base class comparison succeeded.
bool
BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    return this==&other ||
        (LinearMatchNode::operator==(other) &&
         0==uprv_memcmp(s, static_cast<const BTLinearMatchNode &>(other).s, length));
}
// Serializes this node: first the next node, then the match bytes, then the
// lead byte that encodes the match length. The value returned by the last
// write() is stored as this node's offset.
// The builder's write() functions fill the bytes array from its end towards
// its start, so what is written last comes first in the final array.
void
BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
BytesTrieBuilder &b=static_cast<BytesTrieBuilder &>(builder);
next->write(builder);
b.write(s, length);
offset=b.write(b.getMinLinearMatch()+length-1);
}
// Allocates a linear-match node for length bytes of element i's string,
// starting at byteIndex. The node points into the shared string storage
// rather than copying the bytes.
StringTrieBuilder::Node *
BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
                                        Node *nextNode) const {
    const char *matchBytes=elements[i].getString(*strings).data()+byteIndex;
    return new BTLinearMatchNode(matchBytes, length, nextNode);
}
// Makes sure that the bytes array can hold length bytes.
// The array is filled from its end, so when it grows the existing data is
// moved to the end of the new array.
// Returns false if there is no array (an earlier allocation failed) or if the
// new allocation fails; in the latter case the old array is freed as well.
UBool
BytesTrieBuilder::ensureCapacity(int32_t length) {
    if(bytes==nullptr) {
        return false;  // previous memory allocation had failed
    }
    if(length<=bytesCapacity) {
        return true;
    }
    // Double the capacity until it exceeds the requested length.
    int32_t newCapacity=bytesCapacity;
    while(newCapacity<=length) {
        newCapacity*=2;
    }
    char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
    if(newBytes==nullptr) {
        // unable to allocate memory
        uprv_free(bytes);
        bytes=nullptr;
        bytesCapacity=0;
        return false;
    }
    uprv_memcpy(newBytes+(newCapacity-bytesLength),
                bytes+(bytesCapacity-bytesLength), bytesLength);
    uprv_free(bytes);
    bytes=newBytes;
    bytesCapacity=newCapacity;
    return true;
}
// Writes one byte in front of the data written so far (the array is filled
// from its end) and returns the new length.
// If the array cannot be grown, nothing is written and the old length is returned.
int32_t
BytesTrieBuilder::write(int32_t byte) {
    const int32_t newLength=bytesLength+1;
    if(!ensureCapacity(newLength)) {
        return bytesLength;
    }
    bytesLength=newLength;
    bytes[bytesCapacity-newLength]=(char)byte;
    return newLength;
}
// Writes length bytes from b in front of the data written so far (the array
// is filled from its end) and returns the new length.
// If the array cannot be grown, nothing is written and the old length is returned.
int32_t
BytesTrieBuilder::write(const char *b, int32_t length) {
    const int32_t newLength=bytesLength+length;
    if(!ensureCapacity(newLength)) {
        return bytesLength;
    }
    bytesLength=newLength;
    uprv_memcpy(bytes+(bytesCapacity-newLength), b, length);
    return newLength;
}
// Writes length bytes of element i's string, starting at byteIndex.
// Returns what write() returns: the number of bytes written so far.
int32_t
BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
return write(elements[i].getString(*strings).data()+byteIndex, length);
}
// Writes the value i in the variable-length value encoding (1 to 5 bytes),
// with isFinal stored in the lowest bit of the lead byte.
// Returns what write() returns: the number of bytes written so far.
int32_t
BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
// Small non-negative values fit into the lead byte.
return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
}
char intBytes[5];
int32_t length=1;
if(i<0 || i>0xffffff) {
// Negative or large values: lead byte plus all four value bytes, big-endian.
intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
intBytes[1]=(char)((uint32_t)i>>24);
intBytes[2]=(char)((uint32_t)i>>16);
intBytes[3]=(char)((uint32_t)i>>8);
intBytes[4]=(char)i;
length=5;
// } else if(i<=BytesTrie::kMaxOneByteValue) {
// intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
} else {
// 2 to 4 bytes: the lead byte holds the top bits where they fit;
// the lower bytes are appended below, most significant first.
if(i<=BytesTrie::kMaxTwoByteValue) {
intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
} else {
if(i<=BytesTrie::kMaxThreeByteValue) {
intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
} else {
intBytes[0]=(char)BytesTrie::kFourByteValueLead;
intBytes[1]=(char)(i>>16);
length=2;
}
intBytes[length++]=(char)(i>>8);
}
intBytes[length++]=(char)i;
}
// Shift the lead byte left by one and put the final flag into bit 0.
intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
return write(intBytes, length);
}
// Writes the node lead byte and then, if hasValue, the value as a non-final
// value. Returns what the last write returned.
int32_t
BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    const int32_t nodeOffset=write(node);
    if(!hasValue) {
        return nodeOffset;
    }
    return writeValueAndFinal(value, false);
}
// Writes the distance from the current write position back to jumpTarget as
// a variable-length delta. Returns what write() returns.
int32_t
BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    const int32_t delta=bytesLength-jumpTarget;
    U_ASSERT(delta>=0);
    if(delta>BytesTrie::kMaxOneByteDelta) {
        // Multi-byte encoding.
        char intBytes[5];
        const int32_t encodedLength=internalEncodeDelta(delta, intBytes);
        return write(intBytes, encodedLength);
    }
    return write(delta);
}
// Encodes the non-negative delta i into intBytes and returns the number of
// bytes used (1 to 5). intBytes must have room for 5 bytes.
int32_t
BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
U_ASSERT(i>=0);
if(i<=BytesTrie::kMaxOneByteDelta) {
intBytes[0]=(char)i;
return 1;
}
int32_t length=1;
// The lead byte selects the encoding length and holds the top bits where
// they fit; the remaining bytes follow, most significant first.
if(i<=BytesTrie::kMaxTwoByteDelta) {
intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
} else {
if(i<=BytesTrie::kMaxThreeByteDelta) {
intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
} else {
if(i<=0xffffff) {
intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
} else {
intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
intBytes[1]=(char)(i>>24);
length=2;
}
intBytes[length++]=(char)(i>>16);
}
intBytes[length++]=(char)(i>>8);
}
intBytes[length++]=(char)i;
return length;
}
U_NAMESPACE_END

View file

@ -0,0 +1,214 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytestrieiterator.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov03
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/stringpiece.h"
#include "charstr.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Creates an iterator that starts at the beginning of the serialized trie.
// Allocates nothing if errorCode is already a failure. Sets
// U_MEMORY_ALLOCATION_ERROR if either helper object cannot be allocated and
// no other error was reported.
BytesTrie::Iterator::Iterator(const void *trieBytes, int32_t maxStringLength,
                              UErrorCode &errorCode)
        : bytes_(static_cast<const uint8_t *>(trieBytes)),
          pos_(bytes_), initialPos_(bytes_),
          remainingMatchLength_(-1), initialRemainingMatchLength_(-1),
          str_(nullptr), maxLength_(maxStringLength), value_(0), stack_(nullptr) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    // str_ and stack_ are pointers so that it's easy to turn bytestrie.h into
    // a public API header for which we would want it to depend only on
    // other public headers.
    // Unlike BytesTrie itself, its Iterator performs memory allocations anyway
    // via the CharString and UVector32 implementations, so this additional
    // cost is minimal.
    str_=new CharString();
    stack_=new UVector32(errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    if(str_==nullptr || stack_==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
}
// Creates an iterator that starts at the current state of the given trie.
// Allocates nothing if errorCode is already a failure. Sets
// U_MEMORY_ALLOCATION_ERROR if either helper object cannot be allocated.
// If the trie is in the middle of a linear-match node, the remaining match
// bytes (at most maxLength_ of them when maxLength_>0) are appended to str_.
BytesTrie::Iterator::Iterator(const BytesTrie &trie, int32_t maxStringLength,
UErrorCode &errorCode)
: bytes_(trie.bytes_), pos_(trie.pos_), initialPos_(trie.pos_),
remainingMatchLength_(trie.remainingMatchLength_),
initialRemainingMatchLength_(trie.remainingMatchLength_),
str_(nullptr), maxLength_(maxStringLength), value_(0), stack_(nullptr) {
if(U_FAILURE(errorCode)) {
return;
}
str_=new CharString();
stack_=new UVector32(errorCode);
if(U_FAILURE(errorCode)) {
return;
}
if(str_==nullptr || stack_==nullptr) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
if(length>=0) {
// Pending linear-match node, append remaining bytes to str_.
++length;
if(maxLength_>0 && length>maxLength_) {
length=maxLength_; // This will leave remainingMatchLength>=0 as a signal.
}
str_->append(reinterpret_cast<const char *>(pos_), length, errorCode);
pos_+=length;
remainingMatchLength_-=length;
}
}
// Deletes the string and stack objects allocated by the constructors.
// Either pointer may still be null if construction failed.
BytesTrie::Iterator::~Iterator() {
delete str_;
delete stack_;
}
// Restores the state that the iterator had right after construction.
// str_ is truncated to the bytes of an initially pending linear match
// (limited to maxLength_ when maxLength_>0), and the stack is emptied.
// Returns *this.
BytesTrie::Iterator &
BytesTrie::Iterator::reset() {
    int32_t length=initialRemainingMatchLength_+1;  // Remaining match length.
    if(maxLength_>0 && length>maxLength_) {
        length=maxLength_;
    }
    str_->truncate(length);
    // Skip the pending match bytes, which are already in str_.
    pos_=initialPos_+length;
    remainingMatchLength_=initialRemainingMatchLength_-length;
    stack_->setSize(0);
    return *this;
}
// Returns true if there is a current position or there are saved branch
// states left on the stack.
UBool
BytesTrie::Iterator::hasNext() const {
    if(pos_!=nullptr) {
        return true;
    }
    return !stack_->isEmpty();
}
// Advances to the next (byte sequence, value) pair.
// Returns false if errorCode is already a failure or if there is nothing left
// to iterate. Otherwise returns true, with the string in str_ and the value
// in value_; value_ is -1 when the string was cut off at maxLength_ (see
// truncateAndStop()).
UBool
BytesTrie::Iterator::next(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return false;
}
const uint8_t *pos=pos_;
if(pos==nullptr) {
if(stack_->isEmpty()) {
return false;
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
// Each state is a pair of ints: the position as an offset from bytes_, and
// (remaining branch length<<16)|(string length at the branch).
int32_t stackSize=stack_->size();
int32_t length=stack_->elementAti(stackSize-1);
pos=bytes_+stack_->elementAti(stackSize-2);
stack_->setSize(stackSize-2);
str_->truncate(length&0xffff);
length=(int32_t)((uint32_t)length>>16);
if(length>1) {
pos=branchNext(pos, length, errorCode);
if(pos==nullptr) {
return true; // Reached a final value.
}
} else {
// Last edge of the branch: only its comparison byte is left to append.
str_->append((char)*pos++, errorCode);
}
}
if(remainingMatchLength_>=0) {
// We only get here if we started in a pending linear-match node
// with more than maxLength remaining bytes.
return truncateAndStop();
}
for(;;) {
int32_t node=*pos++;
if(node>=kMinValueLead) {
// Deliver value for the byte sequence so far.
UBool isFinal=(UBool)(node&kValueIsFinal);
value_=readValue(pos, node>>1);
if(isFinal || (maxLength_>0 && str_->length()==maxLength_)) {
pos_=nullptr;
} else {
pos_=skipValue(pos, node);
}
return true;
}
if(maxLength_>0 && str_->length()==maxLength_) {
return truncateAndStop();
}
if(node<kMinLinearMatch) {
// Branch node; a lead byte of 0 means that the length is in the next byte.
if(node==0) {
node=*pos++;
}
pos=branchNext(pos, node+1, errorCode);
if(pos==nullptr) {
return true; // Reached a final value.
}
} else {
// Linear-match node, append length bytes to str_.
int32_t length=node-kMinLinearMatch+1;
if(maxLength_>0 && str_->length()+length>maxLength_) {
// Append only as many bytes as still fit, then stop.
str_->append(reinterpret_cast<const char *>(pos),
maxLength_-str_->length(), errorCode);
return truncateAndStop();
}
str_->append(reinterpret_cast<const char *>(pos), length, errorCode);
pos+=length;
}
}
}
// Returns a view of the current byte sequence, or an empty StringPiece
// when no string buffer exists (str_ is null).
StringPiece
BytesTrie::Iterator::getString() const {
    if (str_ != nullptr) {
        return str_->toStringPiece();
    }
    return StringPiece();
}
// Ends exploration of the current path: clears the current position and
// records -1 as the value. Always returns true so that callers can
// "return truncateAndStop();" directly from next().
UBool
BytesTrie::Iterator::truncateAndStop() {
    value_ = -1;  // no real value for str
    pos_ = nullptr;
    return true;
}
// Branch node, needs to take the first outbound edge and push state for the rest.
// pos points at the branch data and length is the number of outbound edges.
// For every edge that is not followed right away, two int32_t values are
// pushed onto stack_: the offset of the saved position from bytes_, and
// (remaining edge count << 16) | current str_ length. next() pops and decodes them.
// Returns nullptr if the first edge leads to a final value (pos_ is then
// cleared and value_ set); otherwise returns the position reached by
// following that edge's jump delta.
const uint8_t *
BytesTrie::Iterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode) {
while(length>kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison byte
// Push state for the greater-or-equal edge.
stack_->addElement((int32_t)(skipDelta(pos)-bytes_), errorCode);
stack_->addElement(((length-(length>>1))<<16)|str_->length(), errorCode);
// Follow the less-than edge.
length>>=1;
pos=jumpByDelta(pos);
}
// List of key-value pairs where values are either final values or jump deltas.
// Read the first (key, value) pair.
uint8_t trieByte=*pos++;
int32_t node=*pos++;
UBool isFinal=(UBool)(node&kValueIsFinal);
int32_t value=readValue(pos, node>>1);
pos=skipValue(pos, node);
// Save the position of the remaining (length-1) pairs before appending the byte,
// so that the stored str_ length excludes trieByte.
stack_->addElement((int32_t)(pos-bytes_), errorCode);
stack_->addElement(((length-1)<<16)|str_->length(), errorCode);
str_->append((char)trieByte, errorCode);
if(isFinal) {
pos_=nullptr;
value_=value;
return nullptr;
} else {
// Not final: value is a jump delta relative to the position after the value.
return pos+value;
}
}
U_NAMESPACE_END

View file

@ -0,0 +1,594 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*****************************************************************************
* Copyright (C) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*****************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/caniter.h"
#include "unicode/normalizer2.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "hash.h"
#include "normalizer2impl.h"
/**
* This class allows one to iterate through all the strings that are canonically equivalent to a given
* string. For example, here are some sample results:
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
1: \u0041\u030A\u0064\u0307\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
2: \u0041\u030A\u0064\u0327\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
3: \u0041\u030A\u1E0B\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
4: \u0041\u030A\u1E11\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
5: \u00C5\u0064\u0307\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
6: \u00C5\u0064\u0327\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
7: \u00C5\u1E0B\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
8: \u00C5\u1E11\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
9: \u212B\u0064\u0307\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
10: \u212B\u0064\u0327\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
11: \u212B\u1E0B\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
12: \u212B\u1E11\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
* since it has not been optimized for that situation.
*@author M. Davis
*@draft
*/
// public
U_NAMESPACE_BEGIN
// TODO: add boilerplate methods.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
/**
 * Creates an iterator over the strings that are canonically equivalent to sourceStr.
 * The source is only installed (via setSource()) when status is still
 * successful and the NFC implementation's canonical-iterator data is available.
 *@param sourceStr string to get results for
 *@param status in/out error code; obtaining the normalizer instances,
 *       ensureCanonIterData() and setSource() all report through it
 */
CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
    pieces(nullptr),
    pieces_length(0),
    pieces_lengths(nullptr),
    current(nullptr),
    current_length(0),
    nfd(Normalizer2::getNFDInstance(status)),
    nfcImpl(Normalizer2Factory::getNFCImpl(status))
{
    if (U_FAILURE(status)) {
        return;
    }
    if (!nfcImpl->ensureCanonIterData(status)) {
        return;
    }
    setSource(sourceStr, status);
}
// Frees the pieces, pieces_lengths and current arrays via cleanPieces().
CanonicalIterator::~CanonicalIterator() {
cleanPieces();
}
void CanonicalIterator::cleanPieces() {
int32_t i = 0;
if(pieces != nullptr) {
for(i = 0; i < pieces_length; i++) {
if(pieces[i] != nullptr) {
delete[] pieces[i];
}
}
uprv_free(pieces);
pieces = nullptr;
pieces_length = 0;
}
if(pieces_lengths != nullptr) {
uprv_free(pieces_lengths);
pieces_lengths = nullptr;
}
if(current != nullptr) {
uprv_free(current);
current = nullptr;
current_length = 0;
}
}
/**
* Returns a copy of the stored source string.
* NOTE: setSource() stores the NFD-normalized form of the string it is given,
* so this is the NFD form rather than the original input.
*@return the source string, in NFD
*/
UnicodeString CanonicalIterator::getSource() {
return source;
}
/**
* Resets the iterator so that one can start again from the beginning.
*/
void CanonicalIterator::reset() {
done = false;
for (int i = 0; i < current_length; ++i) {
current[i] = 0;
}
}
/**
 * Builds the next canonically equivalent string by concatenating the
 * currently selected alternative of every segment, then advances the
 * selection for the following call.
 *@return the next string that is canonically equivalent. A bogus string is
 * returned when the iteration is done.
 */
UnicodeString CanonicalIterator::next() {
    if (done) {
        buffer.setToBogus();
        return buffer;
    }

    // Start from an empty buffer and append one alternative per segment.
    buffer.remove();
    for (int32_t piece = 0; piece < pieces_length; ++piece) {
        const UnicodeString &choice = pieces[piece][current[piece]];
        buffer.append(choice);
    }

    // Advance the indices like an odometer, starting at the last segment:
    // a slot that runs past its number of alternatives wraps to 0 and
    // carries into the slot before it.
    int32_t slot = current_length - 1;
    while (slot >= 0) {
        if (++current[slot] < pieces_lengths[slot]) {
            break;  // got sequence
        }
        current[slot] = 0;
        --slot;
    }
    if (slot < 0) {
        // Every slot wrapped around: all combinations have been delivered.
        done = true;
    }
    return buffer;
}
/**
 * Sets the source string to iterate against. This allows the same iterator
 * to be used while changing the source string, saving object creation.
 *
 * The string is normalized to NFD, split into segments at canonical segment
 * starters, and for each segment the list of equivalent strings is computed.
 * On any failure the per-segment arrays are released and their lengths are
 * reset to 0, so the object never holds partially initialized arrays.
 *
 *@param newSource the new source string
 *@param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR when an
 *       allocation fails, or to whatever normalization / getEquivalents() report
 */
void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
    // All locals are declared up front because the error paths below use goto.
    int32_t list_length = 0;
    UChar32 cp = 0;
    int32_t start = 0;
    int32_t i = 0;
    UnicodeString *list = nullptr;

    nfd->normalize(newSource, source, status);
    if(U_FAILURE(status)) {
        return;
    }
    done = false;

    cleanPieces();

    // catch degenerate case
    if (newSource.length() == 0) {
        pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
        pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
        pieces_length = 1;
        current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
        current_length = 1;
        if (pieces != nullptr) {
            // uprv_malloc() does not clear memory; cleanPieces() must never
            // see an uninitialized pointer in this slot.
            pieces[0] = nullptr;
        }
        if (pieces == nullptr || pieces_lengths == nullptr || current == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CleanPartialInitialization;
        }
        current[0] = 0;
        pieces[0] = new UnicodeString[1];
        pieces_lengths[0] = 1;
        if (pieces[0] == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
            goto CleanPartialInitialization;
        }
        return;
    }

    list = new UnicodeString[source.length()];
    if (list == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto CleanPartialInitialization;
    }

    // i should initially be the number of code units at the
    // start of the string
    i = U16_LENGTH(source.char32At(0));

    // find the segments
    // This code iterates through the source string and
    // extracts segments that end up on a codepoint that
    // doesn't start any decompositions. (Analysis is done
    // on the NFD form - see above).
    for (; i < source.length(); i += U16_LENGTH(cp)) {
        cp = source.char32At(i);
        if (nfcImpl->isCanonSegmentStarter(cp)) {
            source.extract(start, i-start, list[list_length++]); // add up to i
            start = i;
        }
    }
    source.extract(start, i-start, list[list_length++]); // add last one

    // allocate the arrays, and find the strings that are CE to each segment
    pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
    pieces_length = list_length;
    pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
    current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
    current_length = list_length;
    if (pieces != nullptr) {
        // Null out every slot before any error path can reach cleanPieces(),
        // which calls delete[] on each entry.
        for (i = 0; i < pieces_length; i++) {
            pieces[i] = nullptr;
        }
    }
    if (pieces == nullptr || pieces_lengths == nullptr || current == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto CleanPartialInitialization;
    }

    for (i = 0; i < current_length; i++) {
        current[i] = 0;
    }
    // for each segment, get all the combinations that can produce
    // it after NFD normalization
    for (i = 0; i < pieces_length; ++i) {
        pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
    }
    if (U_FAILURE(status)) {
        // getEquivalents() returns nullptr on failure; do not keep an array
        // with null entries that next() would dereference.
        goto CleanPartialInitialization;
    }

    delete[] list;
    return;

// Common section to cleanup all local variables and reset object variables.
CleanPartialInitialization:
    delete[] list;
    cleanPieces();
    // cleanPieces() only resets a length together with a non-null array;
    // make sure no stale length survives when its array was never allocated.
    pieces_length = 0;
    current_length = 0;
}
/**
* Simple recursive implementation of permutation.
* For each code point of source (optionally skipping non-initial code points
* whose combining class is 0), the code point is removed, the rest is permuted
* recursively, and the code point is prefixed to every sub-permutation.
* TODO: optimize
* @param source the string to find permutations for
* @param skipZeros if true, code points with canonical combining class 0 are
*        not moved to the front (except the one already at index 0)
* @param result receives the permutations; each is stored as both key and value
* @param status in/out error code; set to U_UNSUPPORTED_ERROR when the
*        recursion depth limit is exceeded, U_MEMORY_ALLOCATION_ERROR on
*        allocation failure
* @param depth current recursion depth
*/
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status, int32_t depth) {
if(U_FAILURE(status)) {
return;
}
// To avoid an infinite loop caused by permute, we limit the depth of recursive
// calls to permute and return U_UNSUPPORTED_ERROR.
// We know that some unit tests need at least 4. Set to 8 just in case of
// unforeseen use cases.
constexpr int32_t kPermuteDepthLimit = 8;
if (depth > kPermuteDepthLimit) {
status = U_UNSUPPORTED_ERROR;
return;
}
//if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
int32_t i = 0;
// optimization:
// if zero or one character, just return a set with it
// the cheap length() <= 2 test comes first so that code points are only
// counted for strings short enough to possibly be a single code point
if (source.length() <= 2 && source.countChar32() <= 1) {
UnicodeString *toPut = new UnicodeString(source);
/* test for nullptr */
if (toPut == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
result->put(source, toPut, status);
return;
}
// otherwise iterate through the string, and recursively permute all the other characters
UChar32 cp;
Hashtable subpermute(status);
if(U_FAILURE(status)) {
return;
}
subpermute.setValueDeleter(uprv_deleteUObject);
for (i = 0; i < source.length(); i += U16_LENGTH(cp)) {
cp = source.char32At(i);
const UHashElement *ne = nullptr;
int32_t el = UHASH_FIRST;
// Work on a copy: the remove() below is destructive and source is still
// needed for the following iterations.
UnicodeString subPermuteString = source;
// optimization:
// if the character is canonical combining class zero,
// don't permute it
if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
continue;
}
subpermute.removeAll();
// see what the permutations of the characters before and after this one are
//Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status, depth+1);
/* Stop on any error reported by the recursive call */
if(U_FAILURE(status)) {
return;
}
// The remove() above is destructive. The question is do we have to make a copy, or we don't care about the contents
// of source at this point.
// prefix this character to all of them
ne = subpermute.nextElement(el);
while (ne != nullptr) {
UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
UnicodeString *chStr = new UnicodeString(cp);
//test for nullptr
if (chStr == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));
//if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));
result->put(*chStr, chStr, status);
ne = subpermute.nextElement(el);
}
}
//return result;
}
// privates

// We have a segment, in NFD. Find all the strings that are canonically equivalent to it.
// Returns a new[]-allocated array of the equivalents and stores its length in
// result_len, or returns nullptr with status set on failure (including
// U_ILLEGAL_ARGUMENT_ERROR when no equivalent was found).
UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
    Hashtable result(status);
    Hashtable permutations(status);
    Hashtable basic(status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    result.setValueDeleter(uprv_deleteUObject);
    permutations.setValueDeleter(uprv_deleteUObject);
    basic.setValueDeleter(uprv_deleteUObject);

    // Collect the basic equivalents of the segment's code units.
    char16_t USeg[256];
    int32_t segLen = segment.extract(USeg, 256, status);
    getEquivalents2(&basic, USeg, segLen, status);

    // now get all the permutations
    // add only the ones that are canonically equivalent
    // TODO: optimize by not permuting any class zero.
    int32_t basicPos = UHASH_FIRST;
    for (const UHashElement *basicEl = basic.nextElement(basicPos);
            basicEl != nullptr;
            basicEl = basic.nextElement(basicPos)) {
        UnicodeString item = *((UnicodeString *)(basicEl->value.pointer));

        permutations.removeAll();
        permute(item, CANITER_SKIP_ZEROES, &permutations, status);

        int32_t permPos = UHASH_FIRST;
        for (const UHashElement *permEl = permutations.nextElement(permPos);
                permEl != nullptr;
                permEl = permutations.nextElement(permPos)) {
            UnicodeString possible(*((UnicodeString *)(permEl->value.pointer)));
            UnicodeString attempt;
            nfd->normalize(possible, attempt, status);

            // Keep the permutation only if it normalizes back to the segment.
            // TODO: check if operator == is semantically the same as attempt.equals(segment)
            if (attempt == segment) {
                // TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
                result.put(possible, new UnicodeString(possible), status);
            }
        }
    }

    /* Stop if any step above reported an error */
    if (U_FAILURE(status)) {
        return nullptr;
    }

    // convert into a plain array to clean up storage
    const int32_t resultCount = result.count();
    if (resultCount == 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    UnicodeString *finalResult = new UnicodeString[resultCount];
    if (finalResult == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    result_len = 0;
    int32_t resultPos = UHASH_FIRST;
    for (const UHashElement *resultEl = result.nextElement(resultPos);
            resultEl != nullptr;
            resultEl = result.nextElement(resultPos)) {
        finalResult[result_len++] = *((UnicodeString *)(resultEl->value.pointer));
    }
    return finalResult;
}
// Adds to fillinResult the segment itself plus, for every code point of the
// segment that can start a decomposition, each string formed by replacing a
// matching decomposition with its composite followed by the equivalents of
// the remainder (computed by extract()).
// Returns fillinResult on success, nullptr when status indicates a failure.
// Processing stops at the first failure instead of continuing to work with a
// failing status.
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const char16_t *segment, int32_t segLen, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));

    UnicodeString toPut(segment, segLen);
    fillinResult->put(toPut, new UnicodeString(toPut), status);

    UnicodeSet starts;

    // cycle through all the characters
    UChar32 cp;
    for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {
        // see if any character is at the start of some decomposition
        U16_GET(segment, 0, i, segLen, cp);
        if (!nfcImpl->getCanonStartSet(cp, starts)) {
            continue;
        }
        // if so, see which decompositions match
        UnicodeSetIterator iter(starts);
        while (iter.next()) {
            UChar32 cp2 = iter.getCodepoint();
            Hashtable remainder(status);
            // Same guard as in permute() and getEquivalents(): do not touch a
            // table whose construction was attempted with a failing status.
            if (U_FAILURE(status)) {
                return nullptr;
            }
            remainder.setValueDeleter(uprv_deleteUObject);
            if (extract(&remainder, cp2, segment, segLen, i, status) == nullptr) {
                // nullptr means either "no match" (keep going) or an error (stop).
                if (U_FAILURE(status)) {
                    return nullptr;
                }
                continue;
            }

            // there were some matches, so add all the possibilities to the set.
            UnicodeString prefix(segment, i);
            prefix += cp2;

            int32_t el = UHASH_FIRST;
            const UHashElement *ne = remainder.nextElement(el);
            while (ne != nullptr) {
                const UnicodeString &item = *((UnicodeString *)(ne->value.pointer));
                UnicodeString *toAdd = new UnicodeString(prefix);
                /* test for nullptr */
                if (toAdd == nullptr) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return nullptr;
                }
                *toAdd += item;
                fillinResult->put(*toAdd, toAdd, status);

                //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));

                ne = remainder.nextElement(el);
            }
        }
    }

    /* Report any error raised by the put() calls above */
    if(U_FAILURE(status)) {
        return nullptr;
    }
    return fillinResult;
}
/**
* See if the decomposition of comp is at segment starting at segmentPos
* (with canonical rearrangement!)
* If so, take the remainder, and return the equivalents.
* @param fillinResult table that receives the equivalents of the remainder
* @param comp the composite code point whose NFD decomposition is matched
* @param segment the segment's UTF-16 code units
* @param segLen number of code units in segment
* @param segmentPos index in segment at which matching starts
* @param status in/out error code
* @return fillinResult on a match; nullptr if there is no match, if the
*         rearranged string is not canonically equivalent to the rest of the
*         segment, or if status indicates a failure
*/
Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const char16_t *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
//if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
//if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
if (U_FAILURE(status)) {
return nullptr;
}
// temp starts as comp; segment code points that are not part of the
// decomposition are appended to it below. inputLen remembers where they start.
UnicodeString temp(comp);
int32_t inputLen=temp.length();
UnicodeString decompString;
nfd->normalize(temp, decompString, status);
if (U_FAILURE(status)) {
return nullptr;
}
if (decompString.isBogus()) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
const char16_t *decomp=decompString.getBuffer();
int32_t decompLen=decompString.length();
// See if it matches the start of segment (at segmentPos)
UBool ok = false;
UChar32 cp;
int32_t decompPos = 0;
UChar32 decompCp;
U16_NEXT(decomp, decompPos, decompLen, decompCp);
int32_t i = segmentPos;
while(i < segLen) {
U16_NEXT(segment, i, segLen, cp);
if (cp == decompCp) { // if equal, eat another cp from decomp
//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
if (decompPos == decompLen) { // done, have all decomp characters!
// Everything after the last matched code point is kept unchanged.
temp.append(segment+i, segLen-i);
ok = true;
break;
}
U16_NEXT(decomp, decompPos, decompLen, decompCp);
} else {
//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
// brute force approach
temp.append(cp);
/* TODO: optimize
// since we know that the classes are monotonically increasing, after zero
// e.g. 0 5 7 9 0 3
// we can do an optimization
// there are only a few cases that work: zero, less, same, greater
// if both classes are the same, we fail
// if the decomp class < the segment class, we fail
segClass = getClass(cp);
if (decompClass <= segClass) return null;
*/
}
}
if (!ok)
return nullptr; // we failed, characters left over
//if (PROGRESS) printf("Matches\n");
// Nothing was appended after comp: the decomposition consumed the whole rest of the segment.
if (inputLen == temp.length()) {
fillinResult->put(UnicodeString(), new UnicodeString(), status);
return fillinResult; // succeed, but no remainder
}
// brute force approach
// check to make sure result is canonically equivalent
UnicodeString trial;
nfd->normalize(temp, trial, status);
if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
return nullptr;
}
// Recurse on the remainder, i.e. the part of temp after comp.
return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -0,0 +1,97 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef __CAPI_HELPER_H__
#define __CAPI_HELPER_H__
#include "unicode/utypes.h"
U_NAMESPACE_BEGIN
/**
* An internal helper class to help convert between C and C++ APIs.
*
* CType is the type exposed through the C API and CPPType is the C++
* implementation type. The member functions static_cast between this helper
* and CPPType, so CPPType is expected to derive from
* IcuCApiHelper<CType, CPPType, kMagic>.
* kMagic is a per-type marker value used to detect pointers that do not refer
* to a live object of the expected type.
*/
template<typename CType, typename CPPType, int32_t kMagic>
class IcuCApiHelper {
public:
/**
* Convert from the C type to the C++ type (const version).
* Returns nullptr if status already indicates a failure; sets
* U_ILLEGAL_ARGUMENT_ERROR for a null input and U_INVALID_FORMAT_ERROR
* when the object's magic value does not equal kMagic.
*/
static const CPPType* validate(const CType* input, UErrorCode& status);
/**
* Convert from the C type to the C++ type (non-const version).
* Performs the same checks as the const version.
*/
static CPPType* validate(CType* input, UErrorCode& status);
/**
* Convert from the C++ type to the C type (const version).
*/
const CType* exportConstForC() const;
/**
* Convert from the C++ type to the C type (non-const version).
*/
CType* exportForC();
/**
* Invalidates the object by resetting its magic value to 0.
*/
~IcuCApiHelper();
private:
/**
* While the object is valid, fMagic equals kMagic.
*/
int32_t fMagic = kMagic;
};
// Const conversion from the C handle to the C++ object, with validation.
// Fails with U_ILLEGAL_ARGUMENT_ERROR for a null handle and with
// U_INVALID_FORMAT_ERROR when the stored magic value does not match kMagic.
template<typename CType, typename CPPType, int32_t kMagic>
const CPPType*
IcuCApiHelper<CType, CPPType, kMagic>::validate(const CType* input, UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    if (input == nullptr) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    using Helper = IcuCApiHelper<CType, CPPType, kMagic>;
    const CPPType* impl = reinterpret_cast<const CPPType*>(input);
    // View the object through its helper base to read the magic value.
    const Helper* helper = static_cast<const Helper*>(impl);
    if (helper->fMagic == kMagic) {
        return impl;
    }
    status = U_INVALID_FORMAT_ERROR;
    return nullptr;
}
// Non-const conversion: delegates to the const overload and then removes
// the const qualifier that was only added for the call.
template<typename CType, typename CPPType, int32_t kMagic>
CPPType*
IcuCApiHelper<CType, CPPType, kMagic>::validate(CType* input, UErrorCode& status) {
    const CType* readOnlyInput = input;
    return const_cast<CPPType*>(validate(readOnlyInput, status));
}
// Returns this object as a const pointer to the C type: first down to the
// C++ implementation type, then reinterpreted as the C handle.
template<typename CType, typename CPPType, int32_t kMagic>
const CType*
IcuCApiHelper<CType, CPPType, kMagic>::exportConstForC() const {
    const CPPType* self = static_cast<const CPPType*>(this);
    return reinterpret_cast<const CType*>(self);
}
// Returns this object as a mutable pointer to the C type: first down to the
// C++ implementation type, then reinterpreted as the C handle.
template<typename CType, typename CPPType, int32_t kMagic>
CType*
IcuCApiHelper<CType, CPPType, kMagic>::exportForC() {
    CPPType* self = static_cast<CPPType*>(this);
    return reinterpret_cast<CType*>(self);
}
// Destructor: clears the magic value so that validate() rejects the object afterwards.
template<typename CType, typename CPPType, int32_t kMagic>
IcuCApiHelper<CType, CPPType, kMagic>::~IcuCApiHelper() {
// Head off application errors by preventing use of deleted objects.
fMagic = 0;
}
U_NAMESPACE_END
#endif // __CAPI_HELPER_H__

View file

@ -0,0 +1,427 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// characterproperties.cpp
// created: 2018sep03 Markus W. Scherer
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/uchar.h"
#include "unicode/ucpmap.h"
#include "unicode/ucptrie.h"
#include "unicode/umutablecptrie.h"
#include "unicode/uniset.h"
#include "unicode/uscript.h"
#include "unicode/uset.h"
#include "cmemory.h"
#include "emojiprops.h"
#include "mutex.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "ubidi_props.h"
#include "ucase.h"
#include "ucln_cmn.h"
#include "umutex.h"
#include "uprops.h"
using icu::LocalPointer;
#if !UCONFIG_NO_NORMALIZATION
using icu::Normalizer2Factory;
using icu::Normalizer2Impl;
#endif
using icu::UInitOnce;
using icu::UnicodeSet;
namespace {
// Forward declaration; registered as the cleanup callback by the init functions below.
UBool U_CALLCONV characterproperties_cleanup();
// One slot per property source, followed by one slot per integer property.
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
// A lazily created inclusion set together with the once-flag that guards its creation.
struct Inclusion {
UnicodeSet *fSet = nullptr;
UInitOnce fInitOnce {};
};
// Indexed by UPropertySource for [0, UPROPS_SRC_COUNT), and by
// UPROPS_SRC_COUNT + (prop - UCHAR_INT_START) for integer properties.
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
// Cache of binary property sets, indexed by property; accessed under cpMutex.
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
// Cache of integer property maps, indexed by (property - UCHAR_INT_START); accessed under cpMutex.
UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
// Serializes lookup and lazy creation of the entries in sets[] and maps[].
icu::UMutex cpMutex;
//----------------------------------------------------------------
// Inclusions list
//----------------------------------------------------------------
// USetAdder implementation
// Does not use uset.h to reduce code dependencies
// USetAdder callback: adds the single code point c to the UnicodeSet behind the USet handle.
void U_CALLCONV
_set_add(USet *set, UChar32 c) {
    UnicodeSet *target = (UnicodeSet *)set;
    target->add(c);
}
// USetAdder callback: adds the range start..end to the UnicodeSet behind the USet handle.
void U_CALLCONV
_set_addRange(USet *set, UChar32 start, UChar32 end) {
    UnicodeSet *target = (UnicodeSet *)set;
    target->add(start, end);
}
// USetAdder callback: adds the string str to the UnicodeSet behind the USet handle.
// A negative length is passed on with the first constructor argument set to true.
void U_CALLCONV
_set_addString(USet *set, const char16_t *str, int32_t length) {
    const UBool lengthIsNegative = (UBool)(length<0);
    const icu::UnicodeString text(lengthIsNegative, str, length);
    ((UnicodeSet *)set)->add(text);
}
// Library cleanup callback: frees every cached inclusion set, binary
// property set and integer property map, and resets the once-flags so the
// caches can be rebuilt later. Always returns true.
UBool U_CALLCONV characterproperties_cleanup() {
    for (Inclusion &entry : gInclusions) {
        delete entry.fSet;
        entry.fSet = nullptr;
        entry.fInitOnce.reset();
    }
    for (UnicodeSet *&cachedSet : sets) {
        delete cachedSet;
        cachedSet = nullptr;
    }
    for (UCPMap *&cachedMap : maps) {
        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
        cachedMap = nullptr;
    }
    return true;
}
// Builds the inclusion set for property source src and stores it in
// gInclusions[src]. Each case asks the data source(s) behind src to add
// their "property starts" to a new UnicodeSet through a USetAdder.
// On failure errorCode is set and gInclusions[src].fSet is left unchanged;
// the temporary set is released by the LocalPointer.
void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
// This function is invoked only via umtx_initOnce().
U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
if (src == UPROPS_SRC_NONE) {
errorCode = U_INTERNAL_PROGRAM_ERROR;
return;
}
U_ASSERT(gInclusions[src].fSet == nullptr);
LocalPointer<UnicodeSet> incl(new UnicodeSet());
if (incl.isNull()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
// Adapter that lets the C-style addPropertyStarts functions fill incl.
USetAdder sa = {
(USet *)incl.getAlias(),
_set_add,
_set_addRange,
_set_addString,
nullptr, // don't need remove()
nullptr // don't need removeRange()
};
switch(src) {
case UPROPS_SRC_CHAR:
uchar_addPropertyStarts(&sa, &errorCode);
break;
case UPROPS_SRC_PROPSVEC:
upropsvec_addPropertyStarts(&sa, &errorCode);
break;
case UPROPS_SRC_CHAR_AND_PROPSVEC:
uchar_addPropertyStarts(&sa, &errorCode);
upropsvec_addPropertyStarts(&sa, &errorCode);
break;
#if !UCONFIG_NO_NORMALIZATION
case UPROPS_SRC_CASE_AND_NORM: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
if(U_SUCCESS(errorCode)) {
impl->addPropertyStarts(&sa, errorCode);
}
ucase_addPropertyStarts(&sa, &errorCode);
break;
}
case UPROPS_SRC_NFC: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
if(U_SUCCESS(errorCode)) {
impl->addPropertyStarts(&sa, errorCode);
}
break;
}
case UPROPS_SRC_NFKC: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
if(U_SUCCESS(errorCode)) {
impl->addPropertyStarts(&sa, errorCode);
}
break;
}
case UPROPS_SRC_NFKC_CF: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
if(U_SUCCESS(errorCode)) {
impl->addPropertyStarts(&sa, errorCode);
}
break;
}
case UPROPS_SRC_NFC_CANON_ITER: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
if(U_SUCCESS(errorCode)) {
impl->addCanonIterPropertyStarts(&sa, errorCode);
}
break;
}
#endif
case UPROPS_SRC_CASE:
ucase_addPropertyStarts(&sa, &errorCode);
break;
case UPROPS_SRC_BIDI:
ubidi_addPropertyStarts(&sa, &errorCode);
break;
case UPROPS_SRC_INPC:
case UPROPS_SRC_INSC:
case UPROPS_SRC_VO:
uprops_addPropertyStarts(src, &sa, &errorCode);
break;
case UPROPS_SRC_EMOJI: {
const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
if (U_SUCCESS(errorCode)) {
ep->addPropertyStarts(&sa, errorCode);
}
break;
}
case UPROPS_SRC_IDSU:
// New in Unicode 15.1 for just two characters.
// Adds the start of the two-character range and the code point just after it.
sa.add(sa.set, 0x2FFE);
sa.add(sa.set, 0x2FFF + 1);
break;
case UPROPS_SRC_ID_COMPAT_MATH:
uprops_addPropertyStarts(src, &sa, &errorCode);
break;
default:
errorCode = U_INTERNAL_PROGRAM_ERROR;
break;
}
if (U_FAILURE(errorCode)) {
return;
}
// A bogus set is reported as an allocation failure.
if (incl->isBogus()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
// Compact for caching.
incl->compact();
// Transfer ownership to the global cache; characterproperties_cleanup() deletes it.
gInclusions[src].fSet = incl.orphan();
ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
// Returns the cached inclusion set for property source src, building it on
// first use via umtx_initOnce(). Returns nullptr if errorCode already
// indicates a failure, and sets U_ILLEGAL_ARGUMENT_ERROR when src is outside
// [0, UPROPS_SRC_COUNT).
const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    if (!(0 <= src && src < UPROPS_SRC_COUNT)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Inclusion &entry = gInclusions[src];
    umtx_initOnce(entry.fInitOnce, &initInclusion, src, errorCode);
    return entry.fSet;
}
// Builds the inclusion set for integer property prop and stores it in the
// gInclusions slot reserved for that property. The set starts out containing
// code point 0 and additionally gets every code point from the source's
// inclusion ranges at which the property value differs from the value seen
// just before it.
void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    // This function is invoked only via umtx_initOnce().
    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    const int32_t slotIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    U_ASSERT(gInclusions[slotIndex].fSet == nullptr);

    // Start from the inclusion set of the data source behind this property.
    const UnicodeSet *sourceIncl = getInclusionsForSource(uprops_getSource(prop), errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }

    LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
    if (intPropIncl.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    const int32_t rangeCount = sourceIncl->getRangeCount();
    int32_t lastValue = 0;
    for (int32_t rangeIndex = 0; rangeIndex < rangeCount; ++rangeIndex) {
        const UChar32 rangeLast = sourceIncl->getRangeEnd(rangeIndex);
        UChar32 c = sourceIncl->getRangeStart(rangeIndex);
        while (c <= rangeLast) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            const int32_t value = u_getIntPropertyValue(c, prop);
            if (value != lastValue) {
                intPropIncl->add(c);
                lastValue = value;
            }
            ++c;
        }
    }

    if (intPropIncl->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Compact for caching.
    intPropIncl->compact();
    gInclusions[slotIndex].fSet = intPropIncl.orphan();
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
}
} // namespace
U_NAMESPACE_BEGIN
// Returns the inclusion set to scan for property prop.
// Integer properties (UCHAR_INT_START <= prop < UCHAR_INT_LIMIT) get their
// own lazily built set; every other property uses the set of its data source.
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
        UProperty prop, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    if (prop < UCHAR_INT_START || UCHAR_INT_LIMIT <= prop) {
        // Not an integer property: share the per-source inclusion set.
        UPropertySource src = uprops_getSource(prop);
        return getInclusionsForSource(src, errorCode);
    }
    const int32_t slotIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    Inclusion &entry = gInclusions[slotIndex];
    umtx_initOnce(entry.fInitOnce, &initIntPropInclusion, prop, errorCode);
    return entry.fSet;
}
U_NAMESPACE_END
namespace {
// Builds the frozen UnicodeSet of everything that has binary property `property`.
// For the emoji properties of strings, the strings are added first; properties
// that consist only of strings return right after that.
// For code points, only the ranges of the property's inclusion set are scanned
// and runs of code points with the property are added as ranges.
// Returns a new set owned by the caller, or nullptr with errorCode set on
// failure. A set that failed while being filled (failing errorCode or bogus
// set) is never returned, so callers cannot cache an incomplete result.
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    LocalPointer<UnicodeSet> set(new UnicodeSet());
    if (set.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
        // property of strings
        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
        if (U_FAILURE(errorCode)) { return nullptr; }
        USetAdder sa = {
            (USet *)set.getAlias(),
            _set_add,
            _set_addRange,
            _set_addString,
            nullptr, // don't need remove()
            nullptr // don't need removeRange()
        };
        ep->addStrings(&sa, property, errorCode);
        // Do not hand out a partially filled set together with a failing status.
        if (U_FAILURE(errorCode)) { return nullptr; }
        if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
            // property of _only_ strings
            if (set->isBogus()) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return nullptr;
            }
            set->freeze();
            return set.orphan();
        }
    }

    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // Start of the current run of code points that have the property, or -1 if none is open.
    UChar32 startHasProperty = -1;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
            if (u_hasBinaryProperty(c, property)) {
                if (startHasProperty < 0) {
                    // Transition from false to true.
                    startHasProperty = c;
                }
            } else if (startHasProperty >= 0) {
                // Transition from true to false.
                set->add(startHasProperty, c - 1);
                startHasProperty = -1;
            }
        }
    }
    if (startHasProperty >= 0) {
        // The last run extends to the end of the code point range.
        set->add(startHasProperty, 0x10FFFF);
    }
    // Same check as in initInclusion(): a bogus set is reported as an
    // allocation failure.
    if (set->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    set->freeze();
    return set.orphan();
}
// Builds an immutable code point map with the values of integer property
// `property`. Only the ranges of the property's inclusion set are scanned;
// each run of equal values is written to a mutable trie, which is then
// converted into an immutable one.
// nullValue is the trie's initial value (USCRIPT_UNKNOWN for UCHAR_SCRIPT,
// otherwise 0); runs with that value are not written because the trie
// already holds it everywhere.
// Returns the new map (owned by the caller), or nullptr with errorCode set.
UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
    icu::LocalUMutableCPTriePointer mutableTrie(
        umutablecptrie_open(nullValue, nullValue, &errorCode));
    const UnicodeSet *inclusions =
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    if (U_FAILURE(errorCode)) { return nullptr; }
    int32_t numRanges = inclusions->getRangeCount();
    // [start, c - 1] is the current run of code points sharing `value`.
    UChar32 start = 0;
    uint32_t value = nullValue;

    for (int32_t i = 0; i < numRanges; ++i) {
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
            uint32_t nextValue = u_getIntPropertyValue(c, property);
            if (value != nextValue) {
                if (value != nullValue) {
                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
                }
                start = c;
                value = nextValue;
            }
        }
    }
    // Flush the final run. This uses the same test as inside the loop: a run is
    // skipped only when it has the trie's initial value, not merely when it is 0
    // (the two differ when nullValue is USCRIPT_UNKNOWN).
    if (value != nullValue) {
        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
    }

    // Use the fast trie type for two specific properties, the small type otherwise.
    UCPTrieType type;
    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
        type = UCPTRIE_TYPE_FAST;
    } else {
        type = UCPTRIE_TYPE_SMALL;
    }
    // Pick the narrowest value width that can hold the property's maximum value.
    UCPTrieValueWidth valueWidth;
    // TODO: UCharacterProperty.IntProperty
    int32_t max = u_getIntPropertyMaxValue(property);
    if (max <= 0xff) {
        valueWidth = UCPTRIE_VALUE_BITS_8;
    } else if (max <= 0xffff) {
        valueWidth = UCPTRIE_VALUE_BITS_16;
    } else {
        valueWidth = UCPTRIE_VALUE_BITS_32;
    }
    return reinterpret_cast<UCPMap *>(
        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
}
} // namespace
U_NAMESPACE_BEGIN
/**
 * Returns the cached set for a binary property, building it on first use.
 *
 * @param property  must satisfy 0 <= property < UCHAR_BINARY_LIMIT
 * @param errorCode in/out ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR
 *                  if property is out of range
 * @return the cached set, or nullptr on failure
 */
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    const bool inRange = 0 <= property && property < UCHAR_BINARY_LIMIT;
    if (!inRange) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    // The cache lookup and the lazy creation both happen while cpMutex is held.
    Mutex lock(&cpMutex);
    UnicodeSet *cached = sets[property];
    if (cached != nullptr) {
        return cached;
    }
    cached = makeSet(property, errorCode);
    sets[property] = cached;
    return cached;
}
U_NAMESPACE_END
U_NAMESPACE_USE
/**
 * C API wrapper around CharacterProperties::getBinaryPropertySet().
 *
 * @param property   the binary property
 * @param pErrorCode in/out ICU error code; dereferenced unconditionally
 * @return the set as a USet, or nullptr if *pErrorCode indicates a failure
 */
U_CAPI const USet * U_EXPORT2
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
    const UnicodeSet *propertySet =
        CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
    if (U_SUCCESS(*pErrorCode)) {
        return propertySet->toUSet();
    }
    return nullptr;
}
/**
 * Returns the cached code point map for an integer-valued property,
 * building it on first use.
 *
 * @param property   must satisfy UCHAR_INT_START <= property < UCHAR_INT_LIMIT
 * @param pErrorCode in/out ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR
 *                   if property is out of range
 * @return the cached map, or nullptr on failure
 */
U_CAPI const UCPMap * U_EXPORT2
u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) { return nullptr; }
    const bool inRange = UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT;
    if (!inRange) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    // The cache lookup and the lazy creation both happen while cpMutex is held.
    Mutex lock(&cpMutex);
    UCPMap *cached = maps[property - UCHAR_INT_START];
    if (cached != nullptr) {
        return cached;
    }
    cached = makeMap(property, *pErrorCode);
    maps[property - UCHAR_INT_START] = cached;
    return cached;
}

View file

@ -0,0 +1,100 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/chariter.h"
U_NAMESPACE_BEGIN
// Destructor: empty body, nothing is released here.
ForwardCharacterIterator::~ForwardCharacterIterator() {}
// Default constructor: only default-constructs the UObject base.
ForwardCharacterIterator::ForwardCharacterIterator()
: UObject()
{}
// Copy constructor: only copy-constructs the UObject base from other.
ForwardCharacterIterator::ForwardCharacterIterator(const ForwardCharacterIterator &other)
: UObject(other)
{}
// Default constructor: an iterator over empty text (length, position and both bounds are 0).
CharacterIterator::CharacterIterator()
: textLength(0), pos(0), begin(0), end(0) {
}
// Iterator over text of the given length, positioned at 0 with the range [0, length).
// A negative length is treated as 0.
CharacterIterator::CharacterIterator(int32_t length)
    : textLength(length < 0 ? 0 : length),
      pos(0),
      begin(0),
      end(length < 0 ? 0 : length) {
}
// Iterator over text of the given length with the range [0, length) and a start position.
// A negative length is treated as 0; the position is pinned into [0, end].
CharacterIterator::CharacterIterator(int32_t length, int32_t position)
    : textLength(length), pos(position), begin(0), end(length) {
    if (length < 0) {
        textLength = 0;
        end = 0;
    }
    pos = pos < 0 ? 0 : (pos > end ? end : pos);
}
// Iterator over the sub-range [textBegin, textEnd) of text with the given length.
// Every argument is pinned into a consistent state:
//   textLength >= 0,  0 <= begin <= textLength,  begin <= end <= textLength,  begin <= pos <= end.
CharacterIterator::CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position)
    : textLength(length), pos(position), begin(textBegin), end(textEnd) {
    if (textLength < 0) {
        textLength = 0;
    }
    // Each bound is clamped against the already-clamped values before it.
    begin = begin < 0 ? 0 : (begin > textLength ? textLength : begin);
    end = end < begin ? begin : (end > textLength ? textLength : end);
    pos = pos < begin ? begin : (pos > end ? end : pos);
}
// Destructor: empty body, nothing is released here.
CharacterIterator::~CharacterIterator() {}
// Copy constructor: copies the base class part and the four index fields as they are
// (no re-validation of the copied values).
CharacterIterator::CharacterIterator(const CharacterIterator &that) :
ForwardCharacterIterator(that),
textLength(that.textLength), pos(that.pos), begin(that.begin), end(that.end)
{
}
// Copy assignment: assigns the base class part, then copies the four index fields as they are.
CharacterIterator &
CharacterIterator::operator=(const CharacterIterator &that) {
    ForwardCharacterIterator::operator=(that);
    begin = that.begin;
    end = that.end;
    pos = that.pos;
    textLength = that.textLength;
    return *this;
}
// implementing first[32]PostInc() directly in a subclass should be faster
// but these implementations make subclassing a little easier

// Resets the iterator via setToStart() and returns the result of nextPostInc().
char16_t
CharacterIterator::firstPostInc() {
setToStart();
return nextPostInc();
}
// Resets the iterator via setToStart() and returns the result of next32PostInc().
UChar32
CharacterIterator::first32PostInc() {
setToStart();
return next32PostInc();
}
U_NAMESPACE_END

View file

@ -0,0 +1,273 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: charstr.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010may19
* created by: Markus W. Scherer
*/
#include <cstdlib>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "uinvchar.h"
#include "ustr_imp.h"
U_NAMESPACE_BEGIN
// Move constructor: takes over src's buffer and length.
// Resetting src.len to 0 is not strictly necessary because we make no guarantees
// on the source string.
CharString::CharString(CharString&& src) noexcept
    : buffer(std::move(src.buffer)), len(std::exchange(src.len, 0)) {
}
// Move assignment: takes over src's buffer and length.
// Resetting src.len to 0 is not strictly necessary because we make no guarantees
// on the source string.
CharString& CharString::operator=(CharString&& src) noexcept {
    buffer = std::move(src.buffer);
    len = std::exchange(src.len, 0);
    return *this;
}
// Returns a uprv_malloc()'ed copy of the string's bytes including the terminating NUL.
// Sets U_MEMORY_ALLOCATION_ERROR and returns nullptr if the allocation fails.
char *CharString::cloneData(UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return nullptr; }
    char *copy = static_cast<char *>(uprv_malloc(len + 1));
    if (copy != nullptr) {
        uprv_memcpy(copy, buffer.getAlias(), len + 1);
    } else {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
    return copy;
}
// Copies the string into dest if it fits and lets u_terminateChars() handle
// NUL-termination and the resulting status. Returns length() on the early-exit paths.
int32_t CharString::extract(char *dest, int32_t capacity, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return len; }
    const bool badArguments = capacity < 0 || (dest == nullptr && capacity > 0);
    if (badArguments) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return len;
    }
    const char *contents = buffer.getAlias();
    const bool fits = 0 < len && len <= capacity;
    // Skip the copy when extracting into our own buffer.
    if (fits && contents != dest) {
        uprv_memcpy(dest, contents, len);
    }
    return u_terminateChars(dest, capacity, len, &errorCode);
}
// Replaces this string's contents with a copy of s (including its terminating NUL).
// Does nothing on a prior failure, on self-copy, or if the buffer cannot be grown.
CharString &CharString::copyFrom(const CharString &s, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || this == &s) {
        return *this;
    }
    if (!ensureCapacity(s.len + 1, 0, errorCode)) {
        return *this;
    }
    len = s.len;
    uprv_memcpy(buffer.getAlias(), s.buffer.getAlias(), len + 1);
    return *this;
}
// Scans backwards from the last char; returns the highest index holding c, or -1.
int32_t CharString::lastIndexOf(char c) const {
    for (int32_t index = len - 1; index >= 0; --index) {
        if (buffer[index] == c) {
            return index;
        }
    }
    return -1;
}
// Returns true if s occurs as a byte-wise substring of this string.
// An empty s is never considered to be contained.
bool CharString::contains(StringPiece s) const {
    if (s.empty()) { return false; }
    const char *candidate = buffer.getAlias();
    // remaining = number of further start offsets that still leave room for s.
    for (int32_t remaining = len - s.length(); remaining >= 0; --remaining, ++candidate) {
        if (uprv_memcmp(candidate, s.data(), s.length()) == 0) {
            return true;
        }
    }
    return false;
}
// Shortens the string to newLength (negative values count as 0) and re-terminates it.
// Does nothing if newLength is not less than the current length.
CharString &CharString::truncate(int32_t newLength) {
    const int32_t target = newLength < 0 ? 0 : newLength;
    if (target < len) {
        len = target;
        buffer[len] = 0;
    }
    return *this;
}
// Appends one char plus the terminating NUL; no-op if capacity cannot be ensured
// (ensureCapacity() also rejects a prior failure in errorCode).
CharString &CharString::append(char c, UErrorCode &errorCode) {
    if (!ensureCapacity(len + 2, 0, errorCode)) {
        return *this;
    }
    buffer[len] = c;
    ++len;
    buffer[len] = 0;
    return *this;
}
// Appends sLength chars from s (sLength == -1 means s is NUL-terminated).
// Handles three aliasing situations:
//  - s is the pointer previously handed out by getAppendBuffer(): only the length is adjusted;
//  - s points into this string's current contents and the buffer would have to grow:
//    the substring is copied to a temporary first;
//  - otherwise: grow if needed and copy.
CharString &CharString::append(const char *s, int32_t sLength, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return *this;
    }
    if (sLength < -1 || (s == nullptr && sLength != 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    if (sLength < 0) {
        sLength = static_cast<int32_t>(uprv_strlen(s));
    }
    if (sLength <= 0) {
        return *this;  // nothing to append
    }

    const char *const contentsStart = buffer.getAlias();
    const char *const contentsLimit = contentsStart + len;
    const int32_t spare = buffer.getCapacity() - len;  // includes the slot for the NUL

    if (s == contentsLimit) {
        // The caller wrote into the getAppendBuffer().
        if (sLength >= spare) {
            // The caller wrote too much.
            errorCode = U_INTERNAL_PROGRAM_ERROR;
        } else {
            len += sLength;
            buffer[len] = 0;
        }
        return *this;
    }

    const bool pointsIntoContents = contentsStart <= s && s < contentsLimit;
    if (pointsIntoContents && sLength >= spare) {
        // (Part of) this string is appended to itself which requires reallocation,
        // so we have to make a copy of the substring and append that.
        return append(CharString(s, sLength, errorCode), errorCode);
    }

    if (ensureCapacity(len + sLength + 1, 0, errorCode)) {
        // Re-read the buffer pointer: ensureCapacity() may have replaced the array.
        uprv_memcpy(buffer.getAlias() + len, s, sLength);
        len += sLength;
        buffer[len] = 0;
    }
    return *this;
}
// Appends the decimal representation of number (with a leading '-' for negative values).
// Digits are produced least-significant first and then reversed in place.
CharString &CharString::appendNumber(int32_t number, UErrorCode &status) {
    if (number < 0) {
        append('-', status);
        if (U_FAILURE(status)) {
            return *this;
        }
    }
    if (number == 0) {
        append('0', status);
        return *this;
    }

    int32_t digitCount = 0;
    do {
        // For negative numbers the remainder is <= 0, hence the abs().
        const int32_t digit = std::abs(number % 10);
        number /= 10;
        append(static_cast<char>(digit + '0'), status);
        ++digitCount;
        if (U_FAILURE(status)) {
            return *this;
        }
    } while (number != 0);

    // Reverse the digits that were just appended.
    char *chars = data();
    for (int32_t lo = length() - digitCount, hi = length() - 1; lo < hi; ++lo, --hi) {
        std::swap(chars[lo], chars[hi]);
    }
    return *this;
}
// Returns a pointer to the writable space right after the current contents and stores
// its capacity (excluding the slot reserved for the NUL) in resultCapacity.
// Grows the buffer when the free space is smaller than minCapacity.
// On failure returns nullptr with resultCapacity=0.
char *CharString::getAppendBuffer(int32_t minCapacity,
                                  int32_t desiredCapacityHint,
                                  int32_t &resultCapacity,
                                  UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        resultCapacity = 0;
        return nullptr;
    }
    const bool hasRoom = (buffer.getCapacity() - len - 1) >= minCapacity;  // -1 for NUL
    if (!hasRoom &&
            !ensureCapacity(len + minCapacity + 1, len + desiredCapacityHint + 1, errorCode)) {
        resultCapacity = 0;
        return nullptr;
    }
    resultCapacity = buffer.getCapacity() - len - 1;
    return buffer.getAlias() + len;
}
// Convenience overload: forwards the UnicodeString's buffer and length.
CharString &CharString::appendInvariantChars(const UnicodeString &s, UErrorCode &errorCode) {
    return appendInvariantChars(s.getBuffer(), s.length(), errorCode);
}
// Appends ucharsLen UTF-16 code units after converting them with u_UCharsToChars().
// Sets U_INVARIANT_CONVERSION_ERROR (and appends nothing) if uprv_isInvariantUString()
// rejects the input.
CharString &CharString::appendInvariantChars(const char16_t* uchars, int32_t ucharsLen, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return *this;
    }
    if (!uprv_isInvariantUString(uchars, ucharsLen)) {
        errorCode = U_INVARIANT_CONVERSION_ERROR;
        return *this;
    }
    if (!ensureCapacity(len + ucharsLen + 1, 0, errorCode)) {
        return *this;
    }
    u_UCharsToChars(uchars, buffer.getAlias() + len, ucharsLen);
    len += ucharsLen;
    buffer[len] = 0;
    return *this;
}
// Makes sure the buffer can hold at least `capacity` chars, preserving len+1 chars.
// A desiredCapacityHint of 0 means "capacity plus the current capacity".
// The hinted size is tried first when it is larger than `capacity`; if that is not
// the case or the resize fails, the exact `capacity` is tried.
// Returns false (with U_MEMORY_ALLOCATION_ERROR) if no resize succeeds,
// or if errorCode already indicated a failure.
UBool CharString::ensureCapacity(int32_t capacity,
                                 int32_t desiredCapacityHint,
                                 UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return false;
    }
    if (capacity <= buffer.getCapacity()) {
        return true;  // already large enough
    }
    if (desiredCapacityHint == 0) {
        desiredCapacityHint = capacity + buffer.getCapacity();
    }
    bool grown = desiredCapacityHint > capacity &&
                 buffer.resize(desiredCapacityHint, len + 1) != nullptr;
    if (!grown) {
        grown = buffer.resize(capacity, len + 1) != nullptr;
    }
    if (!grown) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    return true;
}
// Appends s as a path component: a directory separator is inserted first when this
// string is non-empty and does not already end with one. Does nothing for an empty s.
CharString &CharString::appendPathPart(StringPiece s, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || s.length() == 0) {
        return *this;
    }
    if (len > 0) {
        const char last = buffer[len - 1];
        if (last != U_FILE_SEP_CHAR && last != U_FILE_ALT_SEP_CHAR) {
            append(getDirSepChar(), errorCode);
        }
    }
    append(s, errorCode);
    return *this;
}
// Appends a directory separator unless the string is empty or already ends with
// U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR. No-op on a prior failure.
CharString &CharString::ensureEndsWithFileSeparator(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || len <= 0) {
        return *this;
    }
    const char last = buffer[len - 1];
    if (last != U_FILE_SEP_CHAR && last != U_FILE_ALT_SEP_CHAR) {
        append(getDirSepChar(), errorCode);
    }
    return *this;
}
// Picks the separator to append: U_FILE_ALT_SEP_CHAR only when the two separators differ
// and the string already contains the alternate one but not the primary one;
// U_FILE_SEP_CHAR in every other case.
char CharString::getDirSepChar() const {
#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
    // We may need to return a different directory separator when building for Cygwin or MSYS2.
    if (len > 0 && !uprv_strchr(data(), U_FILE_SEP_CHAR) && uprv_strchr(data(), U_FILE_ALT_SEP_CHAR)) {
        return U_FILE_ALT_SEP_CHAR;
    }
#endif
    return U_FILE_SEP_CHAR;
}
U_NAMESPACE_END

200
engine/thirdparty/icu4c/common/charstr.h vendored Normal file
View file

@ -0,0 +1,200 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2001-2015, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/19/2001 aliu Creation.
* 05/19/2010 markus Rewritten from scratch
**********************************************************************
*/
#ifndef CHARSTRING_H
#define CHARSTRING_H
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/uobject.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
// Windows needs us to DLL-export the MaybeStackArray template specialization,
// but MacOS X cannot handle it. Same as in digitlst.h.
#if !U_PLATFORM_IS_DARWIN_BASED
template class U_COMMON_API MaybeStackArray<char, 40>;
#endif
/**
* ICU-internal char * string class.
* This class does not assume or enforce any particular character encoding.
* Raw bytes can be stored. The string object owns its characters.
* A terminating NUL is stored, but the class does not prevent embedded NUL characters.
*
* This class wants to be convenient but is also deliberately minimalist.
* Please do not add methods if they only add minor convenience.
* For example:
* cs.data()[5]='a'; // no need for setCharAt(5, 'a')
*/
class U_COMMON_API CharString : public UMemory {
public:
/** Constructs an empty, NUL-terminated string. */
CharString() : len(0) { buffer[0]=0; }
/** Constructs a string with a copy of s; errors from append() are reported via errorCode. */
CharString(StringPiece s, UErrorCode &errorCode) : len(0) {
buffer[0]=0;
append(s, errorCode);
}
/** Explicit "copy constructor" with an error code; see copyFrom() for why there is no standard one. */
CharString(const CharString &s, UErrorCode &errorCode) : len(0) {
buffer[0]=0;
append(s, errorCode);
}
/** Constructs a string with a copy of sLength chars from s; arguments as for append(s, sLength, errorCode). */
CharString(const char *s, int32_t sLength, UErrorCode &errorCode) : len(0) {
buffer[0]=0;
append(s, sLength, errorCode);
}
~CharString() {}
/**
* Move constructor; might leave src in an undefined state.
* This string will have the same contents and state that the source string had.
*/
CharString(CharString &&src) noexcept;
/**
* Move assignment operator; might leave src in an undefined state.
* This string will have the same contents and state that the source string had.
* The behavior is undefined if *this and src are the same object.
*/
CharString &operator=(CharString &&src) noexcept;
/**
* Replaces this string's contents with the other string's contents.
* CharString does not support the standard copy constructor nor
* the assignment operator, to make copies explicit and to
* use a UErrorCode where memory allocations might be needed.
*/
CharString &copyFrom(const CharString &other, UErrorCode &errorCode);
/** @return true if length()==0 */
UBool isEmpty() const { return len==0; }
/** @return the number of chars, not counting the terminating NUL */
int32_t length() const { return len; }
/** Read-only char access; no index bounds check. */
char operator[](int32_t index) const { return buffer[index]; }
/** @return a StringPiece that refers to this string's buffer and length (no copy) */
StringPiece toStringPiece() const { return StringPiece(buffer.getAlias(), len); }
/** @return pointer to the internal buffer */
const char *data() const { return buffer.getAlias(); }
/** @return writable pointer to the internal buffer */
char *data() { return buffer.getAlias(); }
/**
* Allocates length()+1 chars and copies the NUL-terminated data().
* The caller must uprv_free() the result.
*/
char *cloneData(UErrorCode &errorCode) const;
/**
* Copies the contents of the string into dest.
* Checks if there is enough space in dest, extracts the entire string if possible,
* and NUL-terminates dest if possible.
*
* If the string fits into dest but cannot be NUL-terminated (length()==capacity),
* then the error code is set to U_STRING_NOT_TERMINATED_WARNING.
* If the string itself does not fit into dest (length()>capacity),
* then the error code is set to U_BUFFER_OVERFLOW_ERROR.
*
* @param dest Destination string buffer.
* @param capacity Size of the dest buffer (number of chars).
* @param errorCode ICU error code.
* @return length()
*/
int32_t extract(char *dest, int32_t capacity, UErrorCode &errorCode) const;
/** Byte-wise equality: same length and same bytes (the comparison is skipped for empty strings). */
bool operator==(const CharString& other) const {
return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0);
}
bool operator!=(const CharString& other) const {
return !operator==(other);
}
/** Byte-wise equality with a StringPiece: same length and same bytes. */
bool operator==(StringPiece other) const {
return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0);
}
bool operator!=(StringPiece other) const {
return !operator==(other);
}
/** @return last index of c, or -1 if c is not in this string */
int32_t lastIndexOf(char c) const;
/** @return true if s is non-empty and occurs as a substring of this string */
bool contains(StringPiece s) const;
/** Makes the string empty (length 0, NUL at index 0). */
CharString &clear() { len=0; buffer[0]=0; return *this; }
/** Shortens the string to newLength if that is less than length(); a negative newLength is treated as 0. */
CharString &truncate(int32_t newLength);
/** Appends a single char. */
CharString &append(char c, UErrorCode &errorCode);
/** Appends s; forwards to append(const char *, int32_t, UErrorCode &). */
CharString &append(StringPiece s, UErrorCode &errorCode) {
return append(s.data(), s.length(), errorCode);
}
/** Appends s; forwards to append(const char *, int32_t, UErrorCode &). */
CharString &append(const CharString &s, UErrorCode &errorCode) {
return append(s.data(), s.length(), errorCode);
}
/**
* Appends sLength chars from s.
* sLength may be -1 if s is NUL-terminated; s may be nullptr only if sLength is 0.
*/
CharString &append(const char *s, int32_t sLength, UErrorCode &status);
/** Appends the decimal representation of number, with a leading '-' if it is negative. */
CharString &appendNumber(int32_t number, UErrorCode &status);
/**
* Returns a writable buffer for appending and writes the buffer's capacity to
* resultCapacity. Guarantees resultCapacity>=minCapacity if U_SUCCESS().
* There will additionally be space for a terminating NUL right at resultCapacity.
* (This function is similar to ByteSink.GetAppendBuffer().)
*
* The returned buffer is only valid until the next write operation
* on this string.
*
* After writing at most resultCapacity bytes, call append() with the
* pointer returned from this function and the number of bytes written.
*
* @param minCapacity required minimum capacity of the returned buffer;
* must be non-negative
* @param desiredCapacityHint desired capacity of the returned buffer;
* must be non-negative
* @param resultCapacity will be set to the capacity of the returned buffer
* @param errorCode in/out error code
* @return a buffer with resultCapacity>=min_capacity
*/
char *getAppendBuffer(int32_t minCapacity,
int32_t desiredCapacityHint,
int32_t &resultCapacity,
UErrorCode &errorCode);
/** Appends the invariant characters of s; forwards s's buffer and length to the char16_t overload. */
CharString &appendInvariantChars(const UnicodeString &s, UErrorCode &errorCode);
/**
* Appends ucharsLen invariant characters from uchars, converted to chars.
* Sets U_INVARIANT_CONVERSION_ERROR if the input is not an invariant-character string.
*/
CharString &appendInvariantChars(const char16_t* uchars, int32_t ucharsLen, UErrorCode& errorCode);
/**
* Appends a filename/path part, e.g., a directory name.
* First appends a U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR if necessary.
* Does nothing if s is empty.
*/
CharString &appendPathPart(StringPiece s, UErrorCode &errorCode);
/**
* Appends a U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR if this string is not empty
* and does not already end with a U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR.
*/
CharString &ensureEndsWithFileSeparator(UErrorCode &errorCode);
private:
// Character storage; the template argument gives an internal capacity of 40 chars.
MaybeStackArray<char, 40> buffer;
// Number of chars in the string, not counting the terminating NUL.
int32_t len;
// Grows buffer to hold at least `capacity` chars; returns false and sets errorCode on failure.
UBool ensureCapacity(int32_t capacity, int32_t desiredCapacityHint, UErrorCode &errorCode);
CharString(const CharString &other) = delete; // forbid copying of this class
CharString &operator=(const CharString &other) = delete; // forbid copying of this class
/**
* Returns U_FILE_ALT_SEP_CHAR if found in string, and U_FILE_SEP_CHAR is not found.
* Otherwise returns U_FILE_SEP_CHAR.
*/
char getDirSepChar() const;
};
U_NAMESPACE_END
#endif
//eof

View file

@ -0,0 +1,55 @@
// © 2020 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// charstrmap.h
// created: 2020sep01 Frank Yung-Fong Tang
#ifndef __CHARSTRMAP_H__
#define __CHARSTRMAP_H__
#include <utility>
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "uhash.h"
U_NAMESPACE_BEGIN
/**
* Map of const char * keys & values.
* Stores pointers as is: Does not own/copy/adopt/release strings.
*/
class CharStringMap final : public UMemory {
public:
    /** Constructs an unusable non-map. */
    CharStringMap() : map(nullptr) {}

    /**
     * Opens a hash table keyed and valued by const char * strings.
     * @param size      size argument passed on to uhash_openSize()
     * @param errorCode in/out ICU error code, passed on to uhash_openSize()
     */
    CharStringMap(int32_t size, UErrorCode &errorCode) {
        map = uhash_openSize(uhash_hashChars, uhash_compareChars, uhash_compareChars,
                             size, &errorCode);
    }

    /** Move constructor: takes over other's table and leaves other as a non-map. */
    CharStringMap(CharStringMap &&other) noexcept : map(other.map) {
        other.map = nullptr;
    }
    CharStringMap(const CharStringMap &other) = delete;

    /** Closes the table; the pointer may be nullptr (default-constructed or moved-from). */
    ~CharStringMap() {
        uhash_close(map);
    }

    /**
     * Move assignment: closes the table currently owned by this object (previously it
     * was overwritten and leaked), then takes over other's table.
     * Self-assignment is a no-op (previously it reset the map to nullptr and lost the table).
     */
    CharStringMap &operator=(CharStringMap &&other) noexcept {
        if (this != &other) {
            // Same call the destructor makes, so a nullptr map is handled the same way.
            uhash_close(map);
            map = other.map;
            other.map = nullptr;
        }
        return *this;
    }
    CharStringMap &operator=(const CharStringMap &other) = delete;

    /** @return the value pointer stored for key, as returned by uhash_get() */
    const char *get(const char *key) const { return static_cast<const char *>(uhash_get(map, key)); }

    /**
     * Stores the key/value pointers as is (the strings are neither copied nor owned).
     * The const_casts only adapt to uhash_put()'s pointer parameter types.
     */
    void put(const char *key, const char *value, UErrorCode &errorCode) {
        uhash_put(map, const_cast<char *>(key), const_cast<char *>(value), &errorCode);
    }

private:
    UHashtable *map;
};
U_NAMESPACE_END
#endif // __CHARSTRMAP_H__

View file

@ -0,0 +1,138 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2002-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File cmemory.c ICU Heap allocation.
* All ICU heap allocation, both for C and C++ new of ICU
* class types, comes through these functions.
*
* If you have a need to replace ICU allocation, this is the
* place to do it.
*
* Note that uprv_malloc(0) returns a non-nullptr pointer,
* and that a subsequent free of that pointer value is a NOP.
*
******************************************************************************
*/
#include "unicode/uclean.h"
#include "cmemory.h"
#include "putilimp.h"
#include "uassert.h"
#include <stdlib.h>
/* uprv_malloc(0) returns a pointer to this read-only data. */
/* uprv_realloc() and uprv_free() compare against this address so that it is never passed to a free function. */
static const int32_t zeroMem[] = {0, 0, 0, 0, 0, 0};
/* Function Pointers for user-supplied heap functions */
/* Set by u_setMemoryFunctions() and reset to nullptr by cmemory_cleanup(); */
/* while pAlloc/pRealloc/pFree are nullptr the uprv_default_* functions are used. */
static const void *pContext;
static UMemAllocFn *pAlloc;
static UMemReallocFn *pRealloc;
static UMemFreeFn *pFree;
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
#include <stdio.h>
/* Debug counters: n = number of allocations, b = total bytes requested. */
/* They are only referenced by the fprintf() trace in uprv_malloc(), which is currently compiled out. */
static int n=0;
static long b=0;
#endif
/*
 * Allocates s bytes through the user-supplied allocator if one is installed,
 * otherwise through uprv_default_malloc().
 * A request for 0 bytes returns the address of the static zeroMem block.
 */
U_CAPI void * U_EXPORT2
uprv_malloc(size_t s) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
#if 1
    putchar('>');
    fflush(stdout);
#else
    fprintf(stderr,"MALLOC\t#%d\t%ul bytes\t%ul total\n", ++n,s,(b+=s)); fflush(stderr);
#endif
#endif
    if (s == 0) {
        return (void *)zeroMem;
    }
    if (pAlloc != nullptr) {
        return (*pAlloc)(pContext, s);
    }
    return uprv_default_malloc(s);
}
/*
 * Resizes an allocation made by uprv_malloc()/uprv_realloc().
 *  - buffer is the zeroMem block: behaves like uprv_malloc(size).
 *  - size is 0: frees buffer and returns the zeroMem block.
 *  - otherwise: delegates to the user-supplied realloc function if one is installed,
 *    else to uprv_default_realloc().
 */
U_CAPI void * U_EXPORT2
uprv_realloc(void * buffer, size_t size) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
    putchar('~');
    fflush(stdout);
#endif
    if (buffer == zeroMem) {
        return uprv_malloc(size);
    }
    if (size == 0) {
        if (pFree != nullptr) {
            (*pFree)(pContext, buffer);
        } else {
            uprv_default_free(buffer);
        }
        return (void *)zeroMem;
    }
    if (pRealloc != nullptr) {
        return (*pRealloc)(pContext, buffer, size);
    }
    return uprv_default_realloc(buffer, size);
}
/*
 * Releases memory through the user-supplied free function if one is installed,
 * otherwise through uprv_default_free().
 * The static zeroMem block (result of a zero-size allocation) is never passed on.
 */
U_CAPI void U_EXPORT2
uprv_free(void *buffer) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
    putchar('<');
    fflush(stdout);
#endif
    if (buffer == zeroMem) {
        return;
    }
    if (pFree != nullptr) {
        (*pFree)(pContext, buffer);
    } else {
        uprv_default_free(buffer);
    }
}
/*
 * Allocates zero-initialized memory for num items of size bytes each.
 * Returns nullptr if num*size does not fit into a size_t (previously the product
 * silently wrapped around, yielding a block smaller than the caller asked for)
 * or if the underlying allocation fails.
 */
U_CAPI void * U_EXPORT2
uprv_calloc(size_t num, size_t size) {
    /* Reject a product that would wrap around; (size_t)-1 is the largest size_t value. */
    if (num != 0 && size > ((size_t)-1) / num) {
        return nullptr;
    }
    const size_t total = num * size;
    void *mem = uprv_malloc(total);
    if (mem) {
        /* For total==0 this is a zero-length memset on the pointer uprv_malloc(0) returned. */
        uprv_memset(mem, 0, total);
    }
    return mem;
}
/*
 * Installs user-supplied heap functions plus the context pointer passed to each of them.
 * All three functions are required; otherwise *status is set to U_ILLEGAL_ARGUMENT_ERROR
 * and nothing is changed. Nothing is done if *status already indicates a failure.
 */
U_CAPI void U_EXPORT2
u_setMemoryFunctions(const void *context, UMemAllocFn *a, UMemReallocFn *r, UMemFreeFn *f, UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }
    const bool allProvided = a != nullptr && r != nullptr && f != nullptr;
    if (!allProvided) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    pContext = context;
    pAlloc   = a;
    pRealloc = r;
    pFree    = f;
}
/* Forgets any user-supplied heap functions and their context; always returns true. */
U_CFUNC UBool cmemory_cleanup() {
    pAlloc   = nullptr;
    pRealloc = nullptr;
    pFree    = nullptr;
    pContext = nullptr;
    return true;
}

900
engine/thirdparty/icu4c/common/cmemory.h vendored Normal file
View file

@ -0,0 +1,900 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File CMEMORY.H
*
* Contains stdlib.h/string.h memory functions
*
* @author Bertrand A. Damiba
*
* Modification History:
*
* Date Name Description
* 6/20/98 Bertrand Created.
* 05/03/99 stephen Changed from functions to macros.
*
******************************************************************************
*/
#ifndef CMEMORY_H
#define CMEMORY_H
#include "unicode/utypes.h"
#include <stddef.h>
#include <string.h>
#include "unicode/localpointer.h"
#include "uassert.h"
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
#include <stdio.h>
#endif
// uprv_memcpy and uprv_memmove
// Statement-like wrappers around memcpy()/memmove() that U_ASSERT both pointers
// are non-NULL before making the call.
// Three variants are selected by compiler: the clang and GCC variants wrap the
// assertions in diagnostic push/pop pragmas that silence -Waddress, the fallback
// variant has no pragmas. Apart from the pragmas the variants are the same.
// NOTE(review): dst and src appear both in an assertion and in the call, so an
// argument with side effects may be evaluated more than once, depending on how
// U_ASSERT expands.
#if defined(__clang__)
#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
/* Suppress warnings about addresses that will never be NULL */ \
_Pragma("clang diagnostic push") \
_Pragma("clang diagnostic ignored \"-Waddress\"") \
U_ASSERT(dst != NULL); \
U_ASSERT(src != NULL); \
_Pragma("clang diagnostic pop") \
U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
/* Suppress warnings about addresses that will never be NULL */ \
_Pragma("clang diagnostic push") \
_Pragma("clang diagnostic ignored \"-Waddress\"") \
U_ASSERT(dst != NULL); \
U_ASSERT(src != NULL); \
_Pragma("clang diagnostic pop") \
U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#elif defined(__GNUC__)
#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
/* Suppress warnings about addresses that will never be NULL */ \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Waddress\"") \
U_ASSERT(dst != NULL); \
U_ASSERT(src != NULL); \
_Pragma("GCC diagnostic pop") \
U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
/* Suppress warnings about addresses that will never be NULL */ \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Waddress\"") \
U_ASSERT(dst != NULL); \
U_ASSERT(src != NULL); \
_Pragma("GCC diagnostic pop") \
U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#else
#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
U_ASSERT(dst != NULL); \
U_ASSERT(src != NULL); \
U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
U_ASSERT(dst != NULL); \
U_ASSERT(src != NULL); \
U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
} UPRV_BLOCK_MACRO_END
#endif
/**
* \def UPRV_LENGTHOF
* Convenience macro to determine the length of a fixed array at compile-time.
* @param array A fixed length array
* @return The length of the array, in elements
* @internal
*/
#define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
#define uprv_memset(buffer, mark, size) U_STANDARD_CPP_NAMESPACE memset(buffer, mark, size)
#define uprv_memcmp(buffer1, buffer2, size) U_STANDARD_CPP_NAMESPACE memcmp(buffer1, buffer2,size)
#define uprv_memchr(ptr, value, num) U_STANDARD_CPP_NAMESPACE memchr(ptr, value, num)
U_CAPI void * U_EXPORT2
uprv_malloc(size_t s) U_MALLOC_ATTR U_ALLOC_SIZE_ATTR(1);
U_CAPI void * U_EXPORT2
uprv_realloc(void *mem, size_t size) U_ALLOC_SIZE_ATTR(2);
U_CAPI void U_EXPORT2
uprv_free(void *mem);
U_CAPI void * U_EXPORT2
uprv_calloc(size_t num, size_t size) U_MALLOC_ATTR U_ALLOC_SIZE_ATTR2(1,2);
/**
* Get the least significant bits of a pointer (a memory address).
* For example, with a mask of 3, the macro gets the 2 least significant bits,
* which will be 0 if the pointer is 32-bit (4-byte) aligned.
*
* uintptr_t is the most appropriate integer type to cast to.
*/
#define U_POINTER_MASK_LSB(ptr, mask) ((uintptr_t)(ptr) & (mask))
/**
* Create & return an instance of "type" in statically allocated storage.
* e.g.
* static std::mutex *myMutex = STATIC_NEW(std::mutex);
* To destroy an object created in this way, invoke the destructor explicitly, e.g.
* myMutex->~mutex();
* DO NOT use delete.
* DO NOT use with class UMutex, which has specific support for static instances.
*
* STATIC_NEW is intended for use when
* - We want a static (or global) object.
* - We don't want it to ever be destructed, or to explicitly control destruction,
* to avoid use-after-destruction problems.
* - We want to avoid an ordinary heap allocated object,
* to avoid the possibility of memory allocation failures, and
* to avoid memory leak reports, from valgrind, for example.
* This is defined as a macro rather than a template function because each invocation
* must define distinct static storage for the object being returned.
*/
#define STATIC_NEW(type) [] () { \
alignas(type) static char storage[sizeof(type)]; \
return new(storage) type();} ()
/**
* Heap clean up function, called from u_cleanup()
* Clears any user heap functions from u_setMemoryFunctions()
* Does NOT deallocate any remaining allocated memory.
*/
U_CFUNC UBool
cmemory_cleanup(void);
/**
* A function called by <TT>uhash_remove</TT>,
* <TT>uhash_close</TT>, or <TT>uhash_put</TT> to delete
* an existing key or value.
* @param obj A key or value stored in a hashtable
* @see uprv_deleteUObject
*/
typedef void U_CALLCONV UObjectDeleter(void* obj);
/**
* Deleter for UObject instances.
* Works for all subclasses of UObject because it has a virtual destructor.
*/
U_CAPI void U_EXPORT2
uprv_deleteUObject(void *obj);
#ifdef __cplusplus
#include <utility>
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
/**
* "Smart pointer" class, deletes memory via uprv_free().
* For most methods see the LocalPointerBase base class.
* Adds operator[] for array item access.
*
* @see LocalPointerBase
*/
template<typename T>
class LocalMemory : public LocalPointerBase<T> {
public:
    using LocalPointerBase<T>::operator*;
    using LocalPointerBase<T>::operator->;

    /**
     * Adopting constructor.
     * @param p simple pointer to an array of T items; ownership passes to this object
     */
    explicit LocalMemory(T *p=nullptr) : LocalPointerBase<T>(p) {}

    /**
     * Move constructor; afterwards src holds nullptr.
     * @param src source smart pointer
     */
    LocalMemory(LocalMemory<T> &&src) noexcept : LocalPointerBase<T>(src.ptr) {
        src.ptr = nullptr;
    }

    /**
     * Destructor; releases the owned memory via uprv_free().
     */
    ~LocalMemory() {
        uprv_free(this->ptr);
    }

    /**
     * Move assignment; frees the memory currently owned, takes over src's pointer,
     * and leaves src holding nullptr.
     * The behavior is undefined if *this and src are the same object.
     * @param src source smart pointer
     * @return *this
     */
    LocalMemory<T> &operator=(LocalMemory<T> &&src) noexcept {
        uprv_free(this->ptr);
        this->ptr = src.ptr;
        src.ptr = nullptr;
        return *this;
    }

    /**
     * Exchanges the owned pointers of the two objects.
     * @param other other smart pointer
     */
    void swap(LocalMemory<T> &other) noexcept {
        T *mine = this->ptr;
        this->ptr = other.ptr;
        other.ptr = mine;
    }

    /**
     * Non-member swap; forwards to the member swap().
     * @param p1 will get p2's pointer
     * @param p2 will get p1's pointer
     */
    friend inline void swap(LocalMemory<T> &p1, LocalMemory<T> &p2) noexcept {
        p1.swap(p2);
    }

    /**
     * Frees the array currently owned and adopts the one passed in.
     * @param p simple pointer to an array of T items; ownership passes to this object
     */
    void adoptInstead(T *p) {
        uprv_free(this->ptr);
        this->ptr = p;
    }

    /**
     * Replaces the owned array with a newly allocated, zero-filled one.
     * If the allocation fails, the current array is kept and nullptr is returned.
     * @param newCapacity must be >0
     * @return the allocated array pointer, or nullptr if the allocation failed
     */
    inline T *allocateInsteadAndReset(int32_t newCapacity=1);

    /**
     * Replaces the owned array with a newly allocated one, copying length T items
     * from the old array into it.
     * If the allocation fails, the current array is kept and nullptr is returned.
     * @param newCapacity must be >0
     * @param length number of T items to be copied from the old array to the new one;
     *        must be no more than the capacity of the old array,
     *        which the caller must track because the LocalMemory does not track it
     * @return the allocated array pointer, or nullptr if the allocation failed
     */
    inline T *allocateInsteadAndCopy(int32_t newCapacity=1, int32_t length=0);

    /**
     * Writable array item access without bounds checking.
     * @param i array index
     * @return reference to the array item
     */
    T &operator[](ptrdiff_t i) const { return this->ptr[i]; }
};
// Allocates newCapacity zero-filled T items; on success the old array is freed and replaced.
// Returns nullptr (leaving the old array in place) if newCapacity<=0 or the allocation fails.
template<typename T>
inline T *LocalMemory<T>::allocateInsteadAndReset(int32_t newCapacity) {
    if (newCapacity <= 0) {
        return nullptr;
    }
    T *fresh = (T *)uprv_malloc(newCapacity*sizeof(T));
    if (fresh == nullptr) {
        return nullptr;
    }
    uprv_memset(fresh, 0, newCapacity*sizeof(T));
    uprv_free(this->ptr);
    this->ptr = fresh;
    return fresh;
}
// Allocates an array of newCapacity T items, copies up to `length` items
// from the old array into it, then frees the old array and installs the new one.
// Returns nullptr (leaving the old array in place) for a non-positive
// capacity or a failed allocation.
template<typename T>
inline T *LocalMemory<T>::allocateInsteadAndCopy(int32_t newCapacity, int32_t length) {
    if(newCapacity<=0) {
        return nullptr;
    }
    T *fresh=(T *)uprv_malloc(newCapacity*sizeof(T));
    if(fresh==nullptr) {
        return nullptr;
    }
    if(length>0) {
        // Never copy more items than the new array can hold.
        const int32_t numItems= length>newCapacity ? newCapacity : length;
        uprv_memcpy(fresh, LocalPointerBase<T>::ptr, (size_t)numItems*sizeof(T));
    }
    uprv_free(LocalPointerBase<T>::ptr);
    LocalPointerBase<T>::ptr=fresh;
    return fresh;
}
/**
* Simple array/buffer management class using uprv_malloc() and uprv_free().
* Provides an internal array with fixed capacity. Can alias another array
* or allocate one.
*
* The array address is properly aligned for type T. It might not be properly
* aligned for types larger than T (or larger than the largest subtype of T).
*
* Unlike LocalMemory and LocalArray, this class never adopts
* (takes ownership of) another array.
*
* WARNING: MaybeStackArray only works with primitive (plain-old data) types.
* It does NOT know how to call a destructor! If you work with classes with
* destructors, consider:
*
* - LocalArray in localpointer.h if you know the length ahead of time
* - MaybeStackVector if you know the length at runtime
*/
template<typename T, int32_t stackCapacity>
class MaybeStackArray {
public:
// No heap allocation. Use only on the stack.
static void* U_EXPORT2 operator new(size_t) noexcept = delete;
static void* U_EXPORT2 operator new[](size_t) noexcept = delete;
#if U_HAVE_PLACEMENT_NEW
static void* U_EXPORT2 operator new(size_t, void*) noexcept = delete;
#endif
/**
* Default constructor initializes with internal T[stackCapacity] buffer.
*/
MaybeStackArray() : ptr(stackArray), capacity(stackCapacity), needToRelease(false) {}
/**
* Automatically allocates the heap array if the argument is larger than the stack capacity.
* Intended for use when an approximate capacity is known at compile time but the true
* capacity is not known until runtime.
*
* NOTE(review): status is taken by value, so the U_MEMORY_ALLOCATION_ERROR
* assigned below is not visible to the caller. A caller that needs to detect
* a failed allocation can compare getCapacity() against newCapacity.
* Taking the status by reference would change the signature - confirm
* against callers before changing it.
*/
MaybeStackArray(int32_t newCapacity, UErrorCode status) : MaybeStackArray() {
if (U_FAILURE(status)) {
return;
}
if (capacity < newCapacity) {
if (resize(newCapacity) == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
}
}
}
/**
* Destructor deletes the array (if owned).
*/
~MaybeStackArray() { releaseArray(); }
/**
* Move constructor: transfers ownership or copies the stack array.
*/
MaybeStackArray(MaybeStackArray<T, stackCapacity> &&src) noexcept;
/**
* Move assignment: transfers ownership or copies the stack array.
*/
MaybeStackArray<T, stackCapacity> &operator=(MaybeStackArray<T, stackCapacity> &&src) noexcept;
/**
* Returns the array capacity (number of T items).
* @return array capacity
*/
int32_t getCapacity() const { return capacity; }
/**
* Access without ownership change.
* @return the array pointer
*/
T *getAlias() const { return ptr; }
/**
* Returns the array limit. Simple convenience method.
* @return getAlias()+getCapacity()
*/
T *getArrayLimit() const { return getAlias()+capacity; }
// No "operator T *() const" because that can make
// expressions like mbs[index] ambiguous for some compilers.
/**
* Array item access (const).
* No index bounds check.
* @param i array index
* @return reference to the array item
*/
const T &operator[](ptrdiff_t i) const { return ptr[i]; }
/**
* Array item access (writable).
* No index bounds check.
* @param i array index
* @return reference to the array item
*/
T &operator[](ptrdiff_t i) { return ptr[i]; }
/**
* Deletes the array (if owned) and aliases another one, no transfer of ownership.
* If the arguments are illegal, then the current array is unchanged.
* @param otherArray must not be nullptr
* @param otherCapacity must be >0
*/
void aliasInstead(T *otherArray, int32_t otherCapacity) {
if(otherArray!=nullptr && otherCapacity>0) {
releaseArray();
ptr=otherArray;
capacity=otherCapacity;
// The aliased array is not owned, so the destructor must not free it.
needToRelease=false;
}
}
/**
* Deletes the array (if owned) and allocates a new one, copying length T items.
* Returns the new array pointer.
* If the allocation fails, then the current array is unchanged and
* this method returns nullptr.
* @param newCapacity can be less than or greater than the current capacity;
* must be >0
* @param length number of T items to be copied from the old array to the new one
* @return the allocated array pointer, or nullptr if the allocation failed
*/
inline T *resize(int32_t newCapacity, int32_t length=0);
/**
* Gives up ownership of the array if owned, or else clones it,
* copying length T items; resets itself to the internal stack array.
* Returns nullptr if the allocation failed.
* @param length number of T items to copy when cloning,
* and capacity of the clone when cloning
* @param resultCapacity will be set to the returned array's capacity (output-only)
* @return the array pointer;
* caller becomes responsible for deleting the array
*/
inline T *orphanOrClone(int32_t length, int32_t &resultCapacity);
protected:
// Resizes the array to the size of src, then copies the contents of src.
// Does nothing if status already indicates a failure; sets
// U_MEMORY_ALLOCATION_ERROR if the resize fails.
// After a successful resize, capacity equals src.capacity, so the memcpy
// copies all src.capacity items.
void copyFrom(const MaybeStackArray &src, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
if (this->resize(src.capacity, 0) == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
uprv_memcpy(this->ptr, src.ptr, (size_t)capacity * sizeof(T));
}
private:
// Current array: the internal stackArray, an owned heap array, or an alias.
T *ptr;
// Number of T items that ptr points to.
int32_t capacity;
// True only when ptr is a heap array allocated by this object (see resize()).
UBool needToRelease;
// Internal fixed-size storage used until a resize or alias replaces it.
T stackArray[stackCapacity];
// Frees the current array if, and only if, this object owns it.
void releaseArray() {
if(needToRelease) {
uprv_free(ptr);
}
}
// Points back at the internal storage without freeing anything;
// callers release or hand off the previous array first.
void resetToStackArray() {
ptr=stackArray;
capacity=stackCapacity;
needToRelease=false;
}
/* No comparison operators with other MaybeStackArray's. */
bool operator==(const MaybeStackArray & /*other*/) = delete;
bool operator!=(const MaybeStackArray & /*other*/) = delete;
/* No ownership transfer: No copy constructor, no assignment operator. */
MaybeStackArray(const MaybeStackArray & /*other*/) = delete;
void operator=(const MaybeStackArray & /*other*/) = delete;
};
// Move constructor. Starts out sharing src's pointer, capacity and ownership
// flag; then either steals the external array or copies the internal one.
template<typename T, int32_t stackCapacity>
icu::MaybeStackArray<T, stackCapacity>::MaybeStackArray(
        MaybeStackArray <T, stackCapacity>&& src) noexcept
        : ptr(src.ptr), capacity(src.capacity), needToRelease(src.needToRelease) {
    if (src.ptr != src.stackArray) {
        // src used a heap or aliased array, which this object now holds:
        // take ownership away from src
        src.resetToStackArray();
        return;
    }
    // src used its internal storage, which cannot be transferred;
    // copy its contents into our own internal storage instead.
    ptr = stackArray;
    uprv_memcpy(stackArray, src.stackArray, sizeof(T) * src.capacity);
}
// Move assignment: releases the array this object owns (if any), then either
// steals src's external array or copies src's internal stack array.
// Self-assignment is a no-op: without the guard, the owned heap array would
// be freed and the object silently reset to its stack array, or memcpy would
// be called with identical source and destination ranges.
template<typename T, int32_t stackCapacity>
inline MaybeStackArray <T, stackCapacity>&
MaybeStackArray<T, stackCapacity>::operator=(MaybeStackArray <T, stackCapacity>&& src) noexcept {
    if (this == &src) {
        return *this;
    }
    releaseArray();  // in case this instance had its own memory allocated
    capacity = src.capacity;
    needToRelease = src.needToRelease;
    if (src.ptr == src.stackArray) {
        // Internal storage cannot be transferred; copy its contents.
        ptr = stackArray;
        uprv_memcpy(stackArray, src.stackArray, sizeof(T) * src.capacity);
    } else {
        ptr = src.ptr;
        src.resetToStackArray();  // take ownership away from src
    }
    return *this;
}
// Allocates a new heap array of newCapacity items, copies up to `length`
// items from the current array (clamped to both the old and the new
// capacity), releases the old array if owned, and installs the new one.
// Returns nullptr and leaves the object unchanged if newCapacity is not
// positive or the allocation fails.
template<typename T, int32_t stackCapacity>
inline T *MaybeStackArray<T, stackCapacity>::resize(int32_t newCapacity, int32_t length) {
    if(newCapacity>0) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
        // sizeof() yields a size_t, which must be printed with %zu.
        ::fprintf(::stderr, "MaybeStackArray (resize) alloc %d * %zu\n", newCapacity, sizeof(T));
#endif
        T *p=(T *)uprv_malloc(newCapacity*sizeof(T));
        if(p!=nullptr) {
            if(length>0) {
                if(length>capacity) {
                    length=capacity;
                }
                if(length>newCapacity) {
                    length=newCapacity;
                }
                uprv_memcpy(p, ptr, (size_t)length*sizeof(T));
            }
            releaseArray();
            ptr=p;
            capacity=newCapacity;
            needToRelease=true;
        }
        return p;
    } else {
        return nullptr;
    }
}
// Hands the array over to the caller: the owned heap array itself if there
// is one, otherwise a freshly allocated copy of the first `length` items.
// Afterwards this object uses its internal stack array again.
// Returns nullptr (object unchanged) if a clone is needed but length<=0 or
// the allocation fails.
// resultCapacity never exceeds the number of items actually available in
// the returned array: length is clamped to the current capacity in both
// the orphan and the clone case.
template<typename T, int32_t stackCapacity>
inline T *MaybeStackArray<T, stackCapacity>::orphanOrClone(int32_t length, int32_t &resultCapacity) {
    T *p;
    if(needToRelease) {
        p=ptr;
        // Do not report more items than the orphaned array really holds.
        if(length>capacity) {
            length=capacity;
        }
    } else if(length<=0) {
        return nullptr;
    } else {
        if(length>capacity) {
            length=capacity;
        }
        p=(T *)uprv_malloc(length*sizeof(T));
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
        // sizeof() yields a size_t, which must be printed with %zu.
        ::fprintf(::stderr,"MaybeStackArray (orphan) alloc %d * %zu\n", length,sizeof(T));
#endif
        if(p==nullptr) {
            return nullptr;
        }
        uprv_memcpy(p, ptr, (size_t)length*sizeof(T));
    }
    resultCapacity=length;
    resetToStackArray();
    return p;
}
/**
* Variant of MaybeStackArray that allocates a header struct and an array
* in one contiguous memory block, using uprv_malloc() and uprv_free().
* Provides internal memory with fixed array capacity. Can alias another memory
* block or allocate one.
* The stackCapacity is the number of T items in the internal memory,
* not counting the H header.
* Unlike LocalMemory and LocalArray, this class never adopts
* (takes ownership of) another memory block.
*/
template<typename H, typename T, int32_t stackCapacity>
class MaybeStackHeaderAndArray {
public:
    // No heap allocation. Use only on the stack.
    static void* U_EXPORT2 operator new(size_t) noexcept = delete;
    static void* U_EXPORT2 operator new[](size_t) noexcept = delete;
#if U_HAVE_PLACEMENT_NEW
    static void* U_EXPORT2 operator new(size_t, void*) noexcept = delete;
#endif

    /**
     * Default constructor initializes with internal H+T[stackCapacity] buffer.
     */
    MaybeStackHeaderAndArray() : ptr(&stackHeader), capacity(stackCapacity), needToRelease(false) {}
    /**
     * Destructor deletes the memory (if owned).
     */
    ~MaybeStackHeaderAndArray() { releaseMemory(); }
    /**
     * Returns the array capacity (number of T items).
     * @return array capacity
     */
    int32_t getCapacity() const { return capacity; }
    /**
     * Access without ownership change.
     * @return the header pointer
     */
    H *getAlias() const { return ptr; }
    /**
     * Returns the array start.
     * @return array start, same address as getAlias()+1
     */
    T *getArrayStart() const { return reinterpret_cast<T *>(getAlias()+1); }
    /**
     * Returns the array limit.
     * @return array limit
     */
    T *getArrayLimit() const { return getArrayStart()+capacity; }
    /**
     * Access without ownership change. Same as getAlias().
     * A class instance can be used directly in expressions that take an H *.
     * @return the header pointer
     */
    operator H *() const { return ptr; }
    /**
     * Array item access (writable).
     * No index bounds check.
     * @param i array index
     * @return reference to the array item
     */
    T &operator[](ptrdiff_t i) { return getArrayStart()[i]; }
    /**
     * Deletes the memory block (if owned) and aliases another one, no transfer of ownership.
     * If the arguments are illegal, then the current memory is unchanged.
     * @param otherMemory must not be nullptr
     * @param otherCapacity must be >0
     */
    void aliasInstead(H *otherMemory, int32_t otherCapacity) {
        if(otherMemory!=nullptr && otherCapacity>0) {
            releaseMemory();
            ptr=otherMemory;
            capacity=otherCapacity;
            // The aliased block is not owned, so the destructor must not free it.
            needToRelease=false;
        }
    }
    /**
     * Deletes the memory block (if owned) and allocates a new one,
     * copying the header and length T array items.
     * Returns the new header pointer.
     * If the allocation fails, then the current memory is unchanged and
     * this method returns nullptr.
     * @param newCapacity can be less than or greater than the current capacity;
     *                    must be >0
     * @param length number of T items to be copied from the old array to the new one
     * @return the allocated pointer, or nullptr if the allocation failed
     */
    inline H *resize(int32_t newCapacity, int32_t length=0);
    /**
     * Gives up ownership of the memory if owned, or else clones it,
     * copying the header and length T array items; resets itself to the internal memory.
     * Returns nullptr if the allocation failed.
     * @param length number of T items to copy when cloning,
     *        and array capacity of the clone when cloning
     * @param resultCapacity will be set to the returned array's capacity (output-only)
     * @return the header pointer;
     *         caller becomes responsible for deleting the array
     */
    inline H *orphanOrClone(int32_t length, int32_t &resultCapacity);
private:
    // Current memory block: the internal one, an owned heap block, or an alias.
    H *ptr;
    // Number of T items following the header that ptr points to.
    int32_t capacity;
    // True only when ptr is a heap block allocated by this object.
    UBool needToRelease;
    // stackHeader must precede stackArray immediately.
    H stackHeader;
    T stackArray[stackCapacity];
    // Frees the current block if, and only if, this object owns it.
    void releaseMemory() {
        if(needToRelease) {
            uprv_free(ptr);
        }
    }
    // The operations below are deleted rather than given dummy bodies, so that
    // accidental use is a compile-time error (as in MaybeStackArray).
    /* No comparison operators with other MaybeStackHeaderAndArray's. */
    bool operator==(const MaybeStackHeaderAndArray & /*other*/) = delete;
    bool operator!=(const MaybeStackHeaderAndArray & /*other*/) = delete;
    /* No ownership transfer: No copy constructor, no assignment operator. */
    MaybeStackHeaderAndArray(const MaybeStackHeaderAndArray & /*other*/) = delete;
    void operator=(const MaybeStackHeaderAndArray & /*other*/) = delete;
};
// Allocates a new heap block holding one H header plus newCapacity T items,
// copies the header and up to `length` array items (clamped to both the old
// and the new capacity), releases the old block if owned, and installs the
// new one.
// Returns nullptr and leaves the object unchanged if newCapacity is negative
// or the allocation fails.
template<typename H, typename T, int32_t stackCapacity>
inline H *MaybeStackHeaderAndArray<H, T, stackCapacity>::resize(int32_t newCapacity,
                                                                int32_t length) {
    if(newCapacity>=0) {
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
        // sizeof() yields a size_t, which must be printed with %zu.
        ::fprintf(::stderr,"MaybeStackHeaderAndArray alloc %zu + %d * %zu\n", sizeof(H),newCapacity,sizeof(T));
#endif
        H *p=(H *)uprv_malloc(sizeof(H)+newCapacity*sizeof(T));
        if(p!=nullptr) {
            if(length<0) {
                length=0;
            } else if(length>0) {
                if(length>capacity) {
                    length=capacity;
                }
                if(length>newCapacity) {
                    length=newCapacity;
                }
            }
            // The header is always copied, even when no array items are.
            uprv_memcpy(p, ptr, sizeof(H)+(size_t)length*sizeof(T));
            releaseMemory();
            ptr=p;
            capacity=newCapacity;
            needToRelease=true;
        }
        return p;
    } else {
        return nullptr;
    }
}
// Hands the memory block over to the caller: the owned heap block itself if
// there is one, otherwise a freshly allocated copy of the header plus the
// first `length` array items. Afterwards this object uses its internal
// memory again.
// Returns nullptr (object unchanged) if a clone is needed and the allocation
// fails.
// resultCapacity never exceeds the number of array items actually available
// in the returned block: length is clamped to the current capacity in both
// the orphan and the clone case.
template<typename H, typename T, int32_t stackCapacity>
inline H *MaybeStackHeaderAndArray<H, T, stackCapacity>::orphanOrClone(int32_t length,
                                                                       int32_t &resultCapacity) {
    H *p;
    if(needToRelease) {
        p=ptr;
        // Do not report more items than the orphaned block really holds.
        if(length>capacity) {
            length=capacity;
        }
    } else {
        if(length<0) {
            length=0;
        } else if(length>capacity) {
            length=capacity;
        }
#if U_DEBUG && defined(UPRV_MALLOC_COUNT)
        // sizeof() yields a size_t, which must be printed with %zu.
        ::fprintf(::stderr,"MaybeStackHeaderAndArray (orphan) alloc %zu + %d * %zu\n", sizeof(H),length,sizeof(T));
#endif
        p=(H *)uprv_malloc(sizeof(H)+length*sizeof(T));
        if(p==nullptr) {
            return nullptr;
        }
        uprv_memcpy(p, ptr, sizeof(H)+(size_t)length*sizeof(T));
    }
    resultCapacity=length;
    ptr=&stackHeader;
    capacity=stackCapacity;
    needToRelease=false;
    return p;
}
/**
* A simple memory management class that creates new heap allocated objects (of
* any class that has a public constructor), keeps track of them and eventually
* deletes them all in its own destructor.
*
* A typical use-case would be code like this:
*
* MemoryPool<MyType> pool;
*
* MyType* o1 = pool.create();
* if (o1 != nullptr) {
* foo(o1);
* }
*
* MyType* o2 = pool.create(1, 2, 3);
* if (o2 != nullptr) {
* bar(o2);
* }
*
* // MemoryPool will take care of deleting the MyType objects.
*
* It doesn't do anything more than that, and is intentionally kept minimalist.
*/
template<typename T, int32_t stackCapacity = 8>
class MemoryPool : public UMemory {
public:
    MemoryPool() : fCount(0), fPool() {}

    /** Deletes every object that was created through this pool. */
    ~MemoryPool() {
        for (int32_t i = 0; i < fCount; ++i) {
            delete fPool[i];
        }
    }

    MemoryPool(const MemoryPool&) = delete;
    MemoryPool& operator=(const MemoryPool&) = delete;

    /** Takes over all objects of other, which is left empty. */
    MemoryPool(MemoryPool&& other) noexcept : fCount(other.fCount),
                                              fPool(std::move(other.fPool)) {
        other.fCount = 0;
    }

    MemoryPool& operator=(MemoryPool&& other) noexcept {
        // Since `this` may contain instances that need to be deleted, we can't
        // just throw them away and replace them with `other`. The normal way of
        // dealing with this in C++ is to swap `this` and `other`, rather than
        // simply overwrite: the destruction of `other` can then take care of
        // running MemoryPool::~MemoryPool() over the still-to-be-deallocated
        // instances.
        std::swap(fCount, other.fCount);
        std::swap(fPool, other.fPool);
        return *this;
    }

    /**
     * Creates a new object of typename T, by forwarding any and all arguments
     * to the typename T constructor.
     *
     * The object is recorded in the pool (and counted) only if it was
     * actually created; a failed creation leaves the pool's count unchanged.
     *
     * @param args Arguments to be forwarded to the typename T constructor.
     * @return A pointer to the newly created object, or nullptr on error.
     */
    template<typename... Args>
    T* create(Args&&... args) {
        int32_t capacity = fPool.getCapacity();
        // Grow the pointer array first: 4x when leaving the stack storage,
        // 2x afterwards, keeping the existing `capacity` pointers.
        if (fCount == capacity &&
            fPool.resize(capacity == stackCapacity ? 4 * capacity : 2 * capacity,
                         capacity) == nullptr) {
            return nullptr;
        }
        T *object = new T(std::forward<Args>(args)...);
        if (object != nullptr) {
            fPool[fCount++] = object;
        }
        return object;
    }

    /**
     * Same as create(), but does nothing if status already indicates a
     * failure, and sets U_MEMORY_ALLOCATION_ERROR if the creation fails.
     *
     * @param status ICU error code, checked on entry and set on failure.
     * @param args Arguments to be forwarded to the typename T constructor.
     * @return A pointer to the newly created object, or nullptr on error.
     */
    template <typename... Args>
    T* createAndCheckErrorCode(UErrorCode &status, Args &&... args) {
        if (U_FAILURE(status)) {
            return nullptr;
        }
        // Forward (rather than copy) the arguments so that rvalues are moved
        // and move-only argument types work.
        T *pointer = this->create(std::forward<Args>(args)...);
        if (pointer == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return pointer;
    }

    /**
     * @return Number of elements that have been allocated.
     */
    int32_t count() const {
        return fCount;
    }

protected:
    // Number of valid pointers at the start of fPool.
    int32_t fCount;
    // Pointers to the objects owned by this pool.
    MaybeStackArray<T*, stackCapacity> fPool;
};
/**
* An internal Vector-like implementation based on MemoryPool.
*
* Heap-allocates each element and stores pointers.
*
* To append an item to the vector, use emplaceBack.
*
* MaybeStackVector<MyType> vector;
* MyType* element = vector.emplaceBack();
* if (!element) {
* status = U_MEMORY_ALLOCATION_ERROR;
* }
* // do stuff with element
*
* To loop over the vector, use a for loop with indices:
*
* for (int32_t i = 0; i < vector.length(); i++) {
* MyType* element = vector[i];
* }
*/
template<typename T, int32_t stackCapacity = 8>
class MaybeStackVector : protected MemoryPool<T, stackCapacity> {
public:
    /**
     * Appends a new element, constructed from the given arguments.
     * The arguments are perfectly forwarded to the pool's create().
     * @return pointer to the new element, or nullptr on error
     */
    template<typename... Args>
    T* emplaceBack(Args&&... args) {
        return this->create(std::forward<Args>(args)...);
    }

    /**
     * Same as emplaceBack(), with ICU error code handling delegated to the
     * pool's createAndCheckErrorCode().
     * @return pointer to the new element, or nullptr on error
     */
    template <typename... Args>
    T *emplaceBackAndCheckErrorCode(UErrorCode &status, Args &&... args) {
        return this->createAndCheckErrorCode(status, std::forward<Args>(args)...);
    }

    /** @return number of elements in the vector */
    int32_t length() const {
        return this->fCount;
    }

    /** @return the underlying array of element pointers, without ownership change */
    T** getAlias() {
        return this->fPool.getAlias();
    }

    /** @return the underlying array of element pointers (read-only) */
    const T *const *getAlias() const {
        return this->fPool.getAlias();
    }

    /**
     * Array item access (read-only).
     * No index bounds check.
     * @param i array index
     * @return pointer to the element at index i
     */
    const T* operator[](ptrdiff_t i) const {
        return this->fPool[i];
    }

    /**
     * Array item access (writable).
     * No index bounds check.
     * @param i array index
     * @return pointer to the element at index i
     */
    T* operator[](ptrdiff_t i) {
        return this->fPool[i];
    }
};
U_NAMESPACE_END
#endif /* __cplusplus */
#endif /* CMEMORY_H */

View file

@ -0,0 +1,97 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: cpputils.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*/
#ifndef CPPUTILS_H
#define CPPUTILS_H
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "cmemory.h"
/*==========================================================================*/
/* Array copy utility functions */
/*==========================================================================*/
/*
 * Overloads for arrays of primitive types. Each one is a plain byte copy of
 * `count` items; the five-argument forms copy from src+srcStart to
 * dst+dstStart. Source and destination ranges are handed to uprv_memcpy().
 */
static inline void
uprv_arrayCopy(const double* src, double* dst, int32_t count) {
    uprv_memcpy(dst, src, (size_t)count * sizeof(double));
}

static inline void
uprv_arrayCopy(const double* src, int32_t srcStart,
               double* dst, int32_t dstStart, int32_t count) {
    uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(double));
}

static inline void
uprv_arrayCopy(const int8_t* src, int8_t* dst, int32_t count) {
    uprv_memcpy(dst, src, (size_t)count * sizeof(int8_t));
}

static inline void
uprv_arrayCopy(const int8_t* src, int32_t srcStart,
               int8_t* dst, int32_t dstStart, int32_t count) {
    uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(int8_t));
}

static inline void
uprv_arrayCopy(const int16_t* src, int16_t* dst, int32_t count) {
    uprv_memcpy(dst, src, (size_t)count * sizeof(int16_t));
}

static inline void
uprv_arrayCopy(const int16_t* src, int32_t srcStart,
               int16_t* dst, int32_t dstStart, int32_t count) {
    uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(int16_t));
}

static inline void
uprv_arrayCopy(const int32_t* src, int32_t* dst, int32_t count) {
    uprv_memcpy(dst, src, (size_t)count * sizeof(int32_t));
}

static inline void
uprv_arrayCopy(const int32_t* src, int32_t srcStart,
               int32_t* dst, int32_t dstStart, int32_t count) {
    uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(int32_t));
}

static inline void
uprv_arrayCopy(const char16_t *src, int32_t srcStart,
               char16_t *dst, int32_t dstStart, int32_t count) {
    uprv_memcpy(dst+dstStart, src+srcStart, (size_t)count * sizeof(char16_t));
}
/**
 * Copy an array of UnicodeString OBJECTS (not pointers),
 * assigning element by element in increasing index order.
 * Nothing is copied when count is not positive.
 * @internal
 */
static inline void
uprv_arrayCopy(const icu::UnicodeString *src, icu::UnicodeString *dst, int32_t count) {
    for(int32_t i=0; i<count; ++i) {
        dst[i] = src[i];
    }
}
/**
 * Copy an array of UnicodeString OBJECTS (not pointers),
 * starting at src[srcStart] and writing from dst[dstStart] on.
 * Delegates to the three-argument overload.
 * @internal
 */
static inline void
uprv_arrayCopy(const icu::UnicodeString *src, int32_t srcStart,
               icu::UnicodeString *dst, int32_t dstStart, int32_t count) {
    const icu::UnicodeString *from = src+srcStart;
    icu::UnicodeString *to = dst+dstStart;
    uprv_arrayCopy(from, to, count);
}
/**
 * Checks that the string is readable and writable.
 * Sets U_ILLEGAL_ARGUMENT_ERROR if errorCode indicates success so far and
 * the string isBogus(). An errorCode that already indicates a failure is
 * left untouched.
 * NOTE(review): only isBogus() is tested here; whether that also covers a
 * string with an open getBuffer() depends on UnicodeString - confirm.
 */
inline void
uprv_checkCanGetBuffer(const icu::UnicodeString &s, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    if(s.isBogus()) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    }
}
#endif /* CPPUTILS_H */

54
engine/thirdparty/icu4c/common/cstr.cpp vendored Normal file
View file

@ -0,0 +1,54 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2015-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: cstr.cpp
*/
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/unistr.h"
#include "cstr.h"
#include "charstr.h"
#include "uinvchar.h"
U_NAMESPACE_BEGIN
// Converts the UnicodeString into the char string held in s.
// With conversion available: asks extract() for the required length, obtains
// an append buffer of that size from s, extracts into it and appends the
// result. Errors are recorded only in a local UErrorCode and not reported.
// Without conversion: appends one char per code point, '?' for anything
// whose first code unit is not an invariant character.
CStr::CStr(const UnicodeString &in) {
    UErrorCode errorCode = U_ZERO_ERROR;
#if !UCONFIG_NO_CONVERSION || U_CHARSET_IS_UTF8
    // Preflight: a null target with zero capacity only computes the length.
    int32_t length = in.extract(0, in.length(), static_cast<char *>(nullptr), static_cast<uint32_t>(0));
    int32_t capacity = 0;
    char *dest = s.getAppendBuffer(length, length, capacity, errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }
    in.extract(0, in.length(), dest, capacity);
    s.append(dest, length, errorCode);
#else
    // No conversion available. Convert any invariant characters; substitute '?' for the rest.
    // Note: can't just call u_UCharsToChars() or CharString.appendInvariantChars() on the
    //       whole string because they require that the entire input be invariant.
    char narrow[2];
    int i = 0;
    while (i < in.length()) {
        if (uprv_isInvariantUString(in.getBuffer()+i, 1)) {
            u_UCharsToChars(in.getBuffer()+i, narrow, 1);
        } else {
            narrow[0] = '?';
        }
        s.append(narrow, 1, errorCode);
        i = in.moveIndex32(i, 1);
    }
#endif
}
// Nothing to do beyond destroying the members.
CStr::~CStr() = default;
// Returns the converted characters, as stored in s.
const char * CStr::operator ()() const { return s.data(); }
U_NAMESPACE_END

60
engine/thirdparty/icu4c/common/cstr.h vendored Normal file
View file

@ -0,0 +1,60 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File: cstr.h
*/
#ifndef CSTR_H
#define CSTR_H
#include "unicode/unistr.h"
#include "unicode/uobject.h"
#include "unicode/utypes.h"
#include "charstr.h"
/**
* ICU-internal class CStr, a small helper class to facilitate passing UnicodeStrings
* to functions needing (const char *) strings, such as printf().
*
* It is intended primarily for use in debugging or in tests. Uses platform
* default code page conversion, which will do the best job possible,
* but may be lossy, depending on the platform.
*
* If no other conversion is available, use invariant conversion and substitute
* '?' for non-invariant characters.
*
* Example Usage:
* UnicodeString s = whatever;
* printf("%s", CStr(s)());
*
* The explicit call to the CStr() constructor creates a temporary object.
* Operator () on the temporary object returns a (const char *) pointer.
* The lifetime of the (const char *) data is that of the temporary object,
* which works well when passing it as a parameter to another function, such as printf.
*/
U_NAMESPACE_BEGIN
class U_COMMON_API CStr : public UMemory {
public:
    /** Converts the given UnicodeString into an internally held char string. */
    CStr(const UnicodeString &in);
    ~CStr();

    /** @return pointer to the converted characters held by this object */
    const char * operator ()() const;

private:
    // Copying and assignment are forbidden.
    CStr(const CStr &) = delete;
    CStr &operator =(const CStr &) = delete;

    // Holds the converted string.
    CharString s;
};
U_NAMESPACE_END
#endif

View file

@ -0,0 +1,341 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File CSTRING.C
*
* @author Helena Shih
*
* Modification History:
*
* Date Name Description
* 6/18/98 hshih Created
* 09/08/98 stephen Added include for ctype, for Mac Port
* 11/15/99 helena Integrated S/390 IEEE changes.
******************************************************************************
*/
#include <stdlib.h>
#include <stdio.h>
#include "unicode/utypes.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
/*
* We hardcode case conversion for invariant characters to match our expectation
* and the compiler execution charset.
* This prevents problems on systems
* - with non-default casing behavior, like Turkish system locales where
* tolower('I') maps to dotless i and toupper('i') maps to dotted I
* - where there are no lowercase Latin characters at all, or using different
* codes (some old EBCDIC codepages)
*
* This works because the compiler usually runs on a platform where the execution
* charset includes all of the invariant characters at their expected
* code positions, so that the char * string literals in ICU code match
* the char literals here.
*
* Note that the set of lowercase Latin letters is discontiguous in EBCDIC
* and the set of uppercase Latin letters is discontiguous as well.
*/
/* Returns whether c is one of the letters a-z or A-Z in the execution charset. */
U_CAPI UBool U_EXPORT2
uprv_isASCIILetter(char c) {
#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    /* The letters form three separate runs per case in EBCDIC. */
    if(('a'<=c && c<='i') || ('j'<=c && c<='r') || ('s'<=c && c<='z')) {
        return true;
    }
    return ('A'<=c && c<='I') || ('J'<=c && c<='R') || ('S'<=c && c<='Z');
#else
    if('a'<=c && c<='z') {
        return true;
    }
    return 'A'<=c && c<='Z';
#endif
}
/* Maps a lowercase letter a-z to its uppercase form; any other char is returned unchanged. */
U_CAPI char U_EXPORT2
uprv_toupper(char c) {
#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    /* The lowercase letters form three separate runs in EBCDIC. */
    const bool isLower=('a'<=c && c<='i') || ('j'<=c && c<='r') || ('s'<=c && c<='z');
#else
    const bool isLower='a'<=c && c<='z';
#endif
    return isLower ? (char)(c+('A'-'a')) : c;
}
#if 0
/*
* Commented out because cstring.h defines uprv_tolower() to be
* the same as either uprv_asciitolower() or uprv_ebcdictolower()
* to reduce the amount of code to cover with tests.
*
* Note that this uprv_tolower() definition is likely to work for most
* charset families, not just ASCII and EBCDIC, because its #else branch
* is written generically.
*/
U_CAPI char U_EXPORT2
uprv_tolower(char c) {
#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
if(('A'<=c && c<='I') || ('J'<=c && c<='R') || ('S'<=c && c<='Z')) {
c=(char)(c+('a'-'A'));
}
#else
if('A'<=c && c<='Z') {
c=(char)(c+('a'-'A'));
}
#endif
return c;
}
#endif
/* Adds 0x20 to a char in the range 0x41..0x5a; any other char is returned unchanged. */
U_CAPI char U_EXPORT2
uprv_asciitolower(char c) {
    return (c>=0x41 && c<=0x5a) ? (char)(c+0x20) : c;
}
/*
 * Subtracts 0x40 from a char whose unsigned value lies in one of the ranges
 * 0xc1..0xc9, 0xd1..0xd9 or 0xe2..0xe9; any other char is returned unchanged.
 */
U_CAPI char U_EXPORT2
uprv_ebcdictolower(char c) {
    const uint8_t b=(uint8_t)c;
    const bool inUpperRange=
        (0xc1<=b && b<=0xc9) ||
        (0xd1<=b && b<=0xd9) ||
        (0xe2<=b && b<=0xe9);
    return inUpperRange ? (char)(c-0x40) : c;
}
/*
 * Lowercases the NUL-terminated string in place using uprv_tolower().
 * A null pointer is accepted and returned as is.
 * Returns the original str pointer.
 */
U_CAPI char* U_EXPORT2
T_CString_toLowerCase(char* str)
{
    if (str != nullptr) {
        /* The terminating NUL is run through the conversion as well. */
        for (char *p = str; ; ++p) {
            *p = (char)uprv_tolower(*p);
            if (*p == 0) {
                break;
            }
        }
    }
    return str;
}
/*
 * Uppercases the NUL-terminated string in place using uprv_toupper().
 * A null pointer is accepted and returned as is.
 * Returns the original str pointer.
 */
U_CAPI char* U_EXPORT2
T_CString_toUpperCase(char* str)
{
    if (str != nullptr) {
        /* The terminating NUL is run through the conversion as well. */
        for (char *p = str; ; ++p) {
            *p = (char)uprv_toupper(*p);
            if (*p == 0) {
                break;
            }
        }
    }
    return str;
}
/*
 * Takes a int32_t and fills in a char* string with that number "radix"-based.
 * Negative values get a leading '-' only in radix 10; in every other radix
 * the value is formatted as its unsigned 32-bit bit pattern.
 * In radix 10 this writes at most 12 chars ("-2147483648" plus NUL);
 * the worst case is radix 2 with 33 chars (32 digits plus NUL).
 * radix must be in the range 2..16 (asserted).
 * Returns the length of the string (not including the NUL).
 */
U_CAPI int32_t U_EXPORT2
T_CString_integerToString(char* buffer, int32_t v, int32_t radix)
{
    /* Large enough for the longest output: 32 binary digits plus the NUL. */
    char      tbuf[33];
    int32_t   tbx;
    uint8_t   digit;
    int32_t   length = 0;
    uint32_t  uval;

    U_ASSERT(radix>=2 && radix<=16);
    uval = (uint32_t) v;
    if(v<0 && radix == 10) {
        /* Only in base 10 do we consider numbers to be signed. */
        /* Negate in unsigned arithmetic; -v would overflow for INT32_MIN. */
        uval = 0u - uval;
        buffer[length++] = '-';
    }

    tbx = (int32_t)sizeof(tbuf)-1;
    tbuf[tbx] = 0;   /* We are generating the digits backwards.  Null term the end. */
    do {
        digit = (uint8_t)(uval % radix);
        tbuf[--tbx] = (char)(T_CString_itosOffset(digit));
        uval  = uval / radix;
    } while (uval != 0);

    /* copy converted number into user buffer  */
    uprv_strcpy(buffer+length, tbuf+tbx);
    length += (int32_t)sizeof(tbuf) - tbx - 1;
    return length;
}
/*
 * Takes a int64_t and fills in a char* string with that number "radix"-based.
 * Negative values get a leading '-' only in radix 10; in every other radix
 * the value is formatted as its unsigned 64-bit bit pattern.
 * In radix 10 this writes at most 21 chars ("-9223372036854775808" plus NUL);
 * the worst case is radix 2 with 65 chars (64 digits plus NUL).
 * radix must be in the range 2..16 (asserted).
 * Returns the length of the string, not including the terminating NUL.
 */
U_CAPI int32_t U_EXPORT2
T_CString_int64ToString(char* buffer, int64_t v, uint32_t radix)
{
    /* Large enough for the longest output: 64 binary digits plus the NUL. */
    char      tbuf[65];
    int32_t   tbx;
    uint8_t   digit;
    int32_t   length = 0;
    uint64_t  uval;

    U_ASSERT(radix>=2 && radix<=16);
    uval = (uint64_t) v;
    if(v<0 && radix == 10) {
        /* Only in base 10 do we consider numbers to be signed. */
        /* Negate in unsigned arithmetic; -v would overflow for INT64_MIN. */
        uval = (uint64_t)0 - uval;
        buffer[length++] = '-';
    }

    tbx = (int32_t)sizeof(tbuf)-1;
    tbuf[tbx] = 0;   /* We are generating the digits backwards.  Null term the end. */
    do {
        digit = (uint8_t)(uval % radix);
        tbuf[--tbx] = (char)(T_CString_itosOffset(digit));
        uval  = uval / radix;
    } while (uval != 0);

    /* copy converted number into user buffer  */
    uprv_strcpy(buffer+length, tbuf+tbx);
    length += (int32_t)sizeof(tbuf) - tbx - 1;
    return length;
}
/*
 * Parses integerString in the given radix with uprv_strtoul() and returns
 * the result converted to int32_t. The end-of-number position is not used.
 */
U_CAPI int32_t U_EXPORT2
T_CString_stringToInteger(const char *integerString, int32_t radix)
{
    char *unusedEnd;
    return uprv_strtoul(integerString, &unusedEnd, radix);
}
/*
 * Compares two NUL-terminated strings after lowercasing each char with
 * uprv_tolower(). A null pointer sorts before any non-null string, and two
 * null pointers compare equal.
 * Returns 0 if equal, a negative value if str1 sorts first, and a positive
 * value if str2 sorts first.
 */
U_CAPI int U_EXPORT2
uprv_stricmp(const char *str1, const char *str2) {
    if(str1==nullptr) {
        return str2==nullptr ? 0 : -1;
    }
    if(str2==nullptr) {
        return 1;
    }
    /* compare non-nullptr strings lexically with lowercase */
    for(;; ++str1, ++str2) {
        const unsigned char a=(unsigned char)*str1;
        const unsigned char b=(unsigned char)*str2;
        if(a==0) {
            /* str1 ended: equal only if str2 ended as well */
            return b==0 ? 0 : -1;
        }
        if(b==0) {
            return 1;
        }
        /* compare non-zero characters with lowercase */
        const int diff=(int)(unsigned char)uprv_tolower(a)-(int)(unsigned char)uprv_tolower(b);
        if(diff!=0) {
            return diff;
        }
    }
}
/*
 * Like uprv_stricmp(), but compares at most n chars. Strings that agree in
 * their first n chars compare equal.
 * A null pointer sorts before any non-null string, and two null pointers
 * compare equal.
 */
U_CAPI int U_EXPORT2
uprv_strnicmp(const char *str1, const char *str2, uint32_t n) {
    if(str1==nullptr) {
        return str2==nullptr ? 0 : -1;
    }
    if(str2==nullptr) {
        return 1;
    }
    /* compare non-nullptr strings lexically with lowercase */
    for(; n!=0; --n, ++str1, ++str2) {
        const unsigned char a=(unsigned char)*str1;
        const unsigned char b=(unsigned char)*str2;
        if(a==0) {
            /* str1 ended: equal only if str2 ended as well */
            return b==0 ? 0 : -1;
        }
        if(b==0) {
            return 1;
        }
        /* compare non-zero characters with lowercase */
        const int diff=(int)(unsigned char)uprv_tolower(a)-(int)(unsigned char)uprv_tolower(b);
        if(diff!=0) {
            return diff;
        }
    }
    /* no difference within the first n chars */
    return 0;
}
/*
 * Returns a copy of the NUL-terminated string src in memory obtained from
 * uprv_malloc(), or nullptr if the allocation failed.
 */
U_CAPI char* U_EXPORT2
uprv_strdup(const char *src) {
    /* number of bytes including the terminating NUL */
    const size_t numBytes = uprv_strlen(src) + 1;
    char *copy = (char *) uprv_malloc(numBytes);
    if (copy == nullptr) {
        return nullptr;
    }
    uprv_memcpy(copy, src, numBytes);
    return copy;
}
/*
 * Returns a NUL-terminated copy of the first n bytes of src in memory
 * obtained from uprv_malloc(), or nullptr if the allocation failed.
 * A negative n means that src is NUL-terminated and is copied entirely
 * via uprv_strdup().
 */
U_CAPI char* U_EXPORT2
uprv_strndup(const char *src, int32_t n) {
    if(n < 0) {
        return uprv_strdup(src);
    }
    char *copy = (char*)uprv_malloc(n+1);
    if (copy != nullptr) {
        uprv_memcpy(copy, src, n);
        copy[n] = 0;
    }
    return copy;
}

126
engine/thirdparty/icu4c/common/cstring.h vendored Normal file
View file

@ -0,0 +1,126 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File CSTRING.H
*
* Contains CString interface
*
* @author Helena Shih
*
* Modification History:
*
* Date Name Description
* 6/17/98 hshih Created.
* 05/03/99 stephen Changed from functions to macros.
* 06/14/99 stephen Added icu_strncat, icu_strncmp, icu_tolower
*
******************************************************************************
*/
#ifndef CSTRING_H
#define CSTRING_H 1
#include "unicode/utypes.h"
#include "cmemory.h"
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
/*
* Thin wrappers around the C library string functions of the same name
* (strcpy, strlen, strcmp, ...). Each macro forwards its arguments unchanged
* to the library function, qualified with U_STANDARD_CPP_NAMESPACE.
*/
#define uprv_strcpy(dst, src) U_STANDARD_CPP_NAMESPACE strcpy(dst, src)
#define uprv_strlen(str) U_STANDARD_CPP_NAMESPACE strlen(str)
#define uprv_strcmp(s1, s2) U_STANDARD_CPP_NAMESPACE strcmp(s1, s2)
#define uprv_strcat(dst, src) U_STANDARD_CPP_NAMESPACE strcat(dst, src)
#define uprv_strchr(s, c) U_STANDARD_CPP_NAMESPACE strchr(s, c)
#define uprv_strstr(s, c) U_STANDARD_CPP_NAMESPACE strstr(s, c)
#define uprv_strrchr(s, c) U_STANDARD_CPP_NAMESPACE strrchr(s, c)
#define uprv_strncpy(dst, src, size) U_STANDARD_CPP_NAMESPACE strncpy(dst, src, size)
#define uprv_strncmp(s1, s2, n) U_STANDARD_CPP_NAMESPACE strncmp(s1, s2, n)
#define uprv_strncat(dst, src, n) U_STANDARD_CPP_NAMESPACE strncat(dst, src, n)
/**
* Is c an ASCII-repertoire letter a-z or A-Z?
* Note: The implementation is specific to whether ICU is compiled for
* an ASCII-based or EBCDIC-based machine. There just does not seem to be a better name for this.
*/
U_CAPI UBool U_EXPORT2
uprv_isASCIILetter(char c);
// NOTE: For u_asciiToUpper that takes a UChar, see ustr_imp.h
// NOTE(review): uprv_toupper presumably uppercases a single char of the
// compiled-in charset family; confirm against cstring.cpp.
U_CAPI char U_EXPORT2
uprv_toupper(char c);
// Charset-specific lowercasing functions. Callers normally use uprv_tolower,
// which is defined below as whichever of the two matches U_CHARSET_FAMILY.
U_CAPI char U_EXPORT2
uprv_asciitolower(char c);
U_CAPI char U_EXPORT2
uprv_ebcdictolower(char c);
// Select the lowercasing function for the charset family ICU is compiled for;
// any other U_CHARSET_FAMILY value is a build error.
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
# define uprv_tolower uprv_asciitolower
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
# define uprv_tolower uprv_ebcdictolower
#else
# error U_CHARSET_FAMILY is not valid
#endif
/* Wrappers that forward to the C library number-parsing functions of the same name. */
#define uprv_strtod(source, end) U_STANDARD_CPP_NAMESPACE strtod(source, end)
#define uprv_strtoul(str, end, base) U_STANDARD_CPP_NAMESPACE strtoul(str, end, base)
#define uprv_strtol(str, end, base) U_STANDARD_CPP_NAMESPACE strtol(str, end, base)
/* Conversion from a digit to the character with radix base from 2-19 */
/* May need to use U_UPPER_ORDINAL*/
/*
* Digit values 0..9 map to '0'+a; values of 10 and above map to 'A'+(a-10).
* The argument is evaluated more than once, so it must not have side effects.
*/
#define T_CString_itosOffset(a) ((a)<=9?('0'+(a)):('A'+(a)-10))
/**
* Duplicate a NUL-terminated string into memory obtained from uprv_malloc.
* @param src the string to copy
* @return new string (owned by caller, use uprv_free to free),
* or nullptr if the allocation failed.
* @internal
*/
U_CAPI char* U_EXPORT2
uprv_strdup(const char *src);
/**
* uprv_malloc n+1 bytes, and copy n bytes from src into the new string.
* Terminate with a null at offset n. If n is negative (for example -1),
* works like uprv_strdup.
* @param src the bytes to copy
* @param n length of the input string, not including null.
* @return new string (owned by caller, use uprv_free to free),
* or nullptr if the allocation failed.
* @internal
*/
U_CAPI char* U_EXPORT2
uprv_strndup(const char *src, int32_t n);
U_CAPI char* U_EXPORT2
T_CString_toLowerCase(char* str);
U_CAPI char* U_EXPORT2
T_CString_toUpperCase(char* str);
U_CAPI int32_t U_EXPORT2
T_CString_integerToString(char *buffer, int32_t n, int32_t radix);
U_CAPI int32_t U_EXPORT2
T_CString_int64ToString(char *buffer, int64_t n, uint32_t radix);
U_CAPI int32_t U_EXPORT2
T_CString_stringToInteger(const char *integerString, int32_t radix);
/**
* Case-insensitive, language-independent string comparison
* limited to the ASCII character repertoire.
*/
U_CAPI int U_EXPORT2
uprv_stricmp(const char *str1, const char *str2);
/**
* Case-insensitive, language-independent string comparison
* limited to the ASCII character repertoire.
* Compares at most n characters. A nullptr argument sorts before any
* non-null string, and two nullptr arguments compare equal.
*/
U_CAPI int U_EXPORT2
uprv_strnicmp(const char *str1, const char *str2, uint32_t n);
#endif /* ! CSTRING_H */

View file

@ -0,0 +1,55 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: cwchar.c
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001may25
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !U_HAVE_WCSCPY
#include "cwchar.h"
/*
 * Appends the NUL-terminated wide string src to the end of dst
 * (including the terminator) and returns dst.
 */
U_CAPI wchar_t *uprv_wcscat(wchar_t *dst, const wchar_t *src) {
    /* locate the terminator of the existing destination string */
    wchar_t *out=dst;
    for(; *out!=0; ++out) {
    }

    /* copy src over it, up to and including src's own terminator */
    for(;;) {
        const wchar_t unit=*src;
        *out=unit;
        if(unit==0) {
            break;
        }
        ++out;
        ++src;
    }
    return dst;
}
/*
 * Copies the NUL-terminated wide string src (including the terminator)
 * into dst and returns dst.
 */
U_CAPI wchar_t *uprv_wcscpy(wchar_t *dst, const wchar_t *src) {
    wchar_t *out=dst;
    for(;;) {
        const wchar_t unit=*src;
        *out=unit;
        if(unit==0) {
            /* the terminator has been written; done */
            break;
        }
        ++out;
        ++src;
    }
    return dst;
}
/*
 * Returns the number of wide characters in src before its NUL terminator.
 */
U_CAPI size_t uprv_wcslen(const wchar_t *src) {
    const wchar_t *end=src;
    for(; *end!=0; ++end) {
    }
    return end-src;
}
#endif

58
engine/thirdparty/icu4c/common/cwchar.h vendored Normal file
View file

@ -0,0 +1,58 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: cwchar.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001may25
* created by: Markus W. Scherer
*
* This file contains ICU-internal definitions of wchar_t operations.
* These definitions were moved here from cstring.h so that fewer
* ICU implementation files include wchar.h.
*/
#ifndef __CWCHAR_H__
#define __CWCHAR_H__
#include <string.h>
#include <stdlib.h>
#include "unicode/utypes.h"
/* Do this after utypes.h so that we have U_HAVE_WCHAR_H . */
#if U_HAVE_WCHAR_H
# include <wchar.h>
#endif
/*===========================================================================*/
/* Wide-character functions */
/*===========================================================================*/
/* The following are not available on all systems, defined in wchar.h or string.h. */
/*
* When U_HAVE_WCSCPY is set, the uprv_ names are plain aliases for the
* C library functions; otherwise they are declared here as functions
* that ICU implements itself.
*/
#if U_HAVE_WCSCPY
# define uprv_wcscpy wcscpy
# define uprv_wcscat wcscat
# define uprv_wcslen wcslen
#else
U_CAPI wchar_t* U_EXPORT2
uprv_wcscpy(wchar_t *dst, const wchar_t *src);
U_CAPI wchar_t* U_EXPORT2
uprv_wcscat(wchar_t *dst, const wchar_t *src);
U_CAPI size_t U_EXPORT2
uprv_wcslen(const wchar_t *src);
#endif
/* The following are part of the ANSI C standard, defined in stdlib.h . */
/* Both macros forward their arguments unchanged to the C library function. */
#define uprv_wcstombs(mbstr, wcstr, count) U_STANDARD_CPP_NAMESPACE wcstombs(mbstr, wcstr, count)
#define uprv_mbstowcs(wcstr, mbstr, count) U_STANDARD_CPP_NAMESPACE mbstowcs(wcstr, mbstr, count)
#endif

1503
engine/thirdparty/icu4c/common/dictbe.cpp vendored Normal file

File diff suppressed because it is too large Load diff

434
engine/thirdparty/icu4c/common/dictbe.h vendored Normal file
View file

@ -0,0 +1,434 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation *
* and others. All Rights Reserved. *
*******************************************************************************
*/
#ifndef DICTBE_H
#define DICTBE_H
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"
#include "brkeng.h"
#include "hash.h"
#include "mlbe.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
class DictionaryMatcher;
class MlBreakEngine;
class Normalizer2;
/*******************************************************************
* DictionaryBreakEngine
*/
/**
* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
* dictionary to determine language-specific breaks.</p>
*
* <p>After it is constructed a DictionaryBreakEngine may be shared between
* threads without synchronization.</p>
*/
class DictionaryBreakEngine : public LanguageBreakEngine {
private:
/**
* The set of characters handled by this engine
* @internal
*/
UnicodeSet fSet;
public:
/**
* <p>Constructor.</p>
*/
DictionaryBreakEngine();
/**
* <p>Virtual destructor.</p>
*/
virtual ~DictionaryBreakEngine();
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A UText representing the text. The iterator is left at
* the end of the run of characters which the engine is capable of handling
* that starts from the first character in the range.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param foundBreaks vector of int32_t to receive the break positions
* @param isPhraseBreaking NOTE(review): presumably requests phrase-based
* breaking; its effect is defined by the implementation, confirm in dictbe.cpp.
* @param status Information on any errors encountered.
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status ) const override;
protected:
/**
* <p>Set the character set handled by this engine.</p>
*
* @param set A UnicodeSet of the set of characters handled by the engine
*/
virtual void setCharacters( const UnicodeSet &set );
/**
* <p>Divide up a range of known dictionary characters handled by this break engine.
* Pure virtual: each concrete engine supplies its own implementation.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks A UVector32 that receives the break positions
* @param isPhraseBreaking NOTE(review): presumably the flag given to
* findBreaks(); confirm in dictbe.cpp.
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status) const = 0;
};
/*******************************************************************
* ThaiBreakEngine
*/
/**
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
* dictionary and heuristics to determine Thai-specific breaks.</p>
*
* <p>After it is constructed a ThaiBreakEngine may be shared between
* threads without synchronization.</p>
*/
class ThaiBreakEngine : public DictionaryBreakEngine {
private:
/**
* Character sets and the dictionary used by this engine.
* NOTE(review): the role of each set is inferred from its name only;
* confirm against the constructor in dictbe.cpp.
* @internal
*/
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fSuffixSet;
UnicodeSet fMarkSet;
DictionaryMatcher *fDictionary;
public:
/**
* <p>Constructor.</p>
*
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
* @param status ICU error code; presumably set if construction fails
* (confirm in dictbe.cpp).
*/
ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~ThaiBreakEngine();
protected:
/**
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks A UVector32 that receives the break positions
* @param isPhraseBreaking NOTE(review): presumably the flag given to
* findBreaks(); confirm in dictbe.cpp.
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status) const override;
};
/*******************************************************************
* LaoBreakEngine
*/
/**
* <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
* dictionary and heuristics to determine Lao-specific breaks.</p>
*
* <p>After it is constructed a LaoBreakEngine may be shared between
* threads without synchronization.</p>
*/
class LaoBreakEngine : public DictionaryBreakEngine {
private:
/**
* Character sets and the dictionary used by this engine.
* NOTE(review): the role of each set is inferred from its name only;
* confirm against the constructor in dictbe.cpp.
* @internal
*/
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
DictionaryMatcher *fDictionary;
public:
/**
* <p>Constructor.</p>
*
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
* @param status ICU error code; presumably set if construction fails
* (confirm in dictbe.cpp).
*/
LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~LaoBreakEngine();
protected:
/**
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks A UVector32 that receives the break positions
* @param isPhraseBreaking NOTE(review): presumably the flag given to
* findBreaks(); confirm in dictbe.cpp.
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status) const override;
};
/*******************************************************************
* BurmeseBreakEngine
*/
/**
* <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
* DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
*
* <p>After it is constructed a BurmeseBreakEngine may be shared between
* threads without synchronization.</p>
*/
class BurmeseBreakEngine : public DictionaryBreakEngine {
private:
/**
* Character sets and the dictionary used by this engine.
* NOTE(review): the role of each set is inferred from its name only;
* confirm against the constructor in dictbe.cpp.
* @internal
*/
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
DictionaryMatcher *fDictionary;
public:
/**
* <p>Constructor.</p>
*
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
* @param status ICU error code; presumably set if construction fails
* (confirm in dictbe.cpp).
*/
BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~BurmeseBreakEngine();
protected:
/**
* <p>Divide up a range of known dictionary characters.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks A UVector32 that receives the break positions
* @param isPhraseBreaking NOTE(review): presumably the flag given to
* findBreaks(); confirm in dictbe.cpp.
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status) const override;
};
/*******************************************************************
* KhmerBreakEngine
*/
/**
* <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
* DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
*
* <p>After it is constructed a KhmerBreakEngine may be shared between
* threads without synchronization.</p>
*/
class KhmerBreakEngine : public DictionaryBreakEngine {
private:
/**
* Character sets and the dictionary used by this engine.
* NOTE(review): the role of each set is inferred from its name only;
* confirm against the constructor in dictbe.cpp.
* @internal
*/
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
DictionaryMatcher *fDictionary;
public:
/**
* <p>Constructor.</p>
*
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted.
* @param status ICU error code; presumably set if construction fails
* (confirm in dictbe.cpp).
*/
KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~KhmerBreakEngine();
protected:
/**
* <p>Divide up a range of known dictionary characters.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks A UVector32 that receives the break positions
* @param isPhraseBreaking NOTE(review): presumably the flag given to
* findBreaks(); confirm in dictbe.cpp.
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status) const override;
};
#if !UCONFIG_NO_NORMALIZATION
/*******************************************************************
* CjkBreakEngine
*/
// Indicates the language/script that a CjkBreakEngine will handle.
// Passed as the 'type' argument of the CjkBreakEngine constructor.
enum LanguageType {
kKorean,
kChineseJapanese
};
/**
* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
* dictionary with costs associated with each word and
* Viterbi decoding to determine CJK-specific breaks.</p>
*/
class CjkBreakEngine : public DictionaryBreakEngine {
protected:
/**
* Character sets, dictionary and helper objects used by this engine.
* NOTE(review): the role of each member is inferred from its name only;
* confirm against the constructor in dictbe.cpp.
* @internal
*/
UnicodeSet fHangulWordSet;
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
UnicodeSet fClosePunctuationSet;
DictionaryMatcher *fDictionary;
const Normalizer2 *nfkcNorm2;
MlBreakEngine *fMlBreakEngine;
bool isCj;
private:
// Load Japanese extensions.
void loadJapaneseExtensions(UErrorCode& error);
// Load Japanese Hiragana.
void loadHiragana(UErrorCode& error);
// Initialize fSkipSet by loading Japanese Hiragana and extensions.
void initJapanesePhraseParameter(UErrorCode& error);
// Filled by initJapanesePhraseParameter() (see the comment on that method).
Hashtable fSkipSet;
public:
/**
* <p>Constructor.</p>
*
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
* engine is deleted. The DictionaryMatcher must contain costs for each word
* in order for the dictionary to work properly.
* @param type The language/script this engine will handle
* (kKorean or kChineseJapanese).
* @param status ICU error code; presumably set if construction fails
* (confirm in dictbe.cpp).
*/
CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~CjkBreakEngine();
protected:
/**
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks A UVector32 that receives the break positions
* @param isPhraseBreaking NOTE(review): presumably the flag given to
* findBreaks(); confirm in dictbe.cpp.
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status) const override;
};
#endif
U_NAMESPACE_END
/* DICTBE_H */
#endif

View file

@ -0,0 +1,242 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* dictionarydata.cpp
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/
#include "dictionarydata.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/udata.h"
#include "cmemory.h"
#if !UCONFIG_NO_BREAK_ITERATION
U_NAMESPACE_BEGIN
// Out-of-class definitions of the constants declared in DictionaryData.
// Trie type, stored in indexes[IX_TRIE_TYPE] and extracted with TRIE_TYPE_MASK.
const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
// Flag bit that is combined with the trie type in indexes[IX_TRIE_TYPE].
const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
// Transform specification (indexes[IX_TRANSFORM]): the bits selected by
// TRANSFORM_TYPE_MASK give the transform type; for TRANSFORM_TYPE_OFFSET the
// bits selected by TRANSFORM_OFFSET_MASK give the offset code point.
const int32_t DictionaryData::TRANSFORM_NONE = 0;
const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
// Out-of-line definition of the virtual destructor; the base class has nothing to release.
DictionaryMatcher::~DictionaryMatcher() {
}
// Closes the UDataMemory that was handed to the constructor.
UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
udata_close(file);
}
// Identifies this matcher as UCharsTrie-based.
int32_t UCharsDictionaryMatcher::getType() const {
return DictionaryData::TRIE_TYPE_UCHARS;
}
// Walks the UCharsTrie with code points read from the current position of
// 'text' and records every dictionary word found along the way.
//
// For each word (up to 'limit' of them) the non-null output arrays receive:
//   values[i]    - the trie value of the word
//   lengths[i]   - its length in native UText index units
//   cpLengths[i] - its length in code points
// If 'prefix' is non-null it receives the number of code points consumed.
// Returns the number of words recorded.
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                                         int32_t *lengths, int32_t *cpLengths, int32_t *values,
                                         int32_t *prefix) const {
    UCharsTrie trie(characters);
    const int32_t startIndex = (int32_t)utext_getNativeIndex(text);
    int32_t numWords = 0;
    int32_t numCodePoints = 0;

    for (;;) {
        const UChar32 c = utext_next32(text);
        if (c < 0) {
            // utext_next32() returned a negative value: no further code point.
            break;
        }

        // The first code point starts a fresh trie traversal; later ones continue it.
        const UStringTrieResult result = (numCodePoints == 0) ? trie.first(c) : trie.next(c);
        const int32_t nativeLength = (int32_t)utext_getNativeIndex(text) - startIndex;
        ++numCodePoints;

        if (USTRINGTRIE_HAS_VALUE(result)) {
            // A complete word ends here; record it if there is still room.
            if (numWords < limit) {
                if (values != nullptr) {
                    values[numWords] = trie.getValue();
                }
                if (lengths != nullptr) {
                    lengths[numWords] = nativeLength;
                }
                if (cpLengths != nullptr) {
                    cpLengths[numWords] = numCodePoints;
                }
                ++numWords;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                // No longer word can follow this one.
                break;
            }
        } else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        if (nativeLength >= maxLength) {
            break;
        }
    }

    if (prefix != nullptr) {
        *prefix = numCodePoints;
    }
    return numWords;
}
// Closes the UDataMemory that was handed to the constructor.
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
udata_close(file);
}
// Maps a code point to the byte value used as a key in the BytesTrie.
//
// With an offset-type transform, U+200D maps to 0xFF, U+200C maps to 0xFE, and
// any other code point maps to (c - offset) if that lies in 0..0xFD; otherwise
// U_SENTINEL is returned. With any other transform type c is returned unchanged.
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
    const bool isOffsetTransform =
        (transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET;
    if (!isOffsetTransform) {
        return c;
    }

    // The two highest byte values are reserved for the joiner characters.
    switch (c) {
    case 0x200D:
        return 0xFF;
    case 0x200C:
        return 0xFE;
    default:
        break;
    }

    const int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
    if (0 <= delta && delta <= 0xFD) {
        return (UChar32)delta;
    }
    return U_SENTINEL;
}
// Identifies this matcher as BytesTrie-based.
int32_t BytesDictionaryMatcher::getType() const {
return DictionaryData::TRIE_TYPE_BYTES;
}
// Walks the BytesTrie with code points read from the current position of
// 'text', each mapped through transform(), and records every dictionary word
// found along the way.
//
// For each word (up to 'limit' of them) the non-null output arrays receive:
//   values[i]    - the trie value of the word
//   lengths[i]   - its length in native UText index units
//   cpLengths[i] - its length in code points
// If 'prefix' is non-null it receives the number of code points consumed.
// Returns the number of words recorded.
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                                        int32_t *lengths, int32_t *cpLengths, int32_t *values,
                                        int32_t *prefix) const {
    BytesTrie trie(characters);
    const int32_t startIndex = (int32_t)utext_getNativeIndex(text);
    int32_t numWords = 0;
    int32_t numCodePoints = 0;

    for (;;) {
        const UChar32 c = utext_next32(text);
        if (c < 0) {
            // utext_next32() returned a negative value: no further code point.
            break;
        }

        // The first code point starts a fresh trie traversal; later ones continue it.
        const UChar32 mapped = transform(c);
        const UStringTrieResult result = (numCodePoints == 0) ? trie.first(mapped) : trie.next(mapped);
        const int32_t nativeLength = (int32_t)utext_getNativeIndex(text) - startIndex;
        ++numCodePoints;

        if (USTRINGTRIE_HAS_VALUE(result)) {
            // A complete word ends here; record it if there is still room.
            if (numWords < limit) {
                if (values != nullptr) {
                    values[numWords] = trie.getValue();
                }
                if (lengths != nullptr) {
                    lengths[numWords] = nativeLength;
                }
                if (cpLengths != nullptr) {
                    cpLengths[numWords] = numCodePoints;
                }
                ++numWords;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                // No longer word can follow this one.
                break;
            }
        } else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        if (nativeLength >= maxLength) {
            break;
        }
    }

    if (prefix != nullptr) {
        *prefix = numCodePoints;
    }
    return numWords;
}
U_NAMESPACE_END
U_NAMESPACE_USE
// Swaps the byte order of dictionary (.dict) data.
//
// The data header is swapped with udata_swapDataHeader(); the data must have
// data format "Dict" and format version 1. When length is non-negative the
// body is copied to outData (unless swapping in place), the indexes are
// swapped as 32-bit units and a UCharsTrie is swapped as 16-bit units; a
// BytesTrie needs no swapping. When length is negative the body is neither
// copied nor swapped and only the size is computed.
//
// Returns headerSize plus the total size read from the indexes, or 0 with
// *pErrorCode set on failure.
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
           void *outData, UErrorCode *pErrorCode) {
    const int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    // The UDataInfo is read from 4 bytes into the data: check for "Dict", version 1.
    const UDataInfo *info = (const UDataInfo *)((const char *)inData + 4);
    const bool isDictionaryData =
        info->dataFormat[0] == 0x44 &&
        info->dataFormat[1] == 0x69 &&
        info->dataFormat[2] == 0x63 &&
        info->dataFormat[3] == 0x74 &&
        info->formatVersion[0] == 1;
    if (!isDictionaryData) {
        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                         info->dataFormat[0], info->dataFormat[1], info->dataFormat[2], info->dataFormat[3], info->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
    uint8_t *outBytes = (outData == nullptr) ? nullptr : (uint8_t *)outData + headerSize;
    const int32_t *inIndexes = (const int32_t *)inBytes;

    int32_t indexes[DictionaryData::IX_COUNT];
    if (length >= 0) {
        // From here on 'length' counts only the bytes after the header.
        length -= headerSize;
        if (length < (int32_t)(sizeof(indexes))) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    // Read the indexes in the input's byte order.
    for (int32_t i = 0; i < DictionaryData::IX_COUNT; ++i) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }
    const int32_t size = indexes[DictionaryData::IX_TOTAL_SIZE];

    if (length >= 0) {
        if (length < size) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        // Copy everything first so that parts which need no swapping are carried over.
        if (inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        // The indexes themselves.
        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);

        // The string trie, from the end of the indexes up to IX_RESERVED1_OFFSET.
        const int32_t trieStart = (int32_t)sizeof(indexes);
        const int32_t trieLimit = indexes[DictionaryData::IX_RESERVED1_OFFSET];
        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
            ds->swapArray16(ds, inBytes + trieStart, trieLimit - trieStart, outBytes + trieStart, pErrorCode);
        } else if (trieType != DictionaryData::TRIE_TYPE_BYTES) {
            // TRIE_TYPE_BYTES needs no swapping; anything else is unknown.
            udata_printError(ds, "udict_swap(): unknown trie type!\n");
            *pErrorCode = U_UNSUPPORTED_ERROR;
            return 0;
        }

        // The two sections that follow (up to IX_RESERVED2_OFFSET and up to
        // IX_TOTAL_SIZE) are empty in the current format, but may be used later.
    }
    return headerSize + size;
}
#endif

View file

@ -0,0 +1,191 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* dictionarydata.h
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/
#ifndef __DICTIONARYDATA_H__
#define __DICTIONARYDATA_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/utext.h"
#include "unicode/udata.h"
#include "udataswp.h"
#include "unicode/uobject.h"
#include "unicode/ustringtrie.h"
U_NAMESPACE_BEGIN
class UCharsTrie;
class BytesTrie;
// Constants that describe the layout of dictionary data:
// trie types, transform types, and the slots of the indexes[] array.
class U_COMMON_API DictionaryData : public UMemory {
public:
// Trie type values and flags (see IX_TRIE_TYPE).
static const int32_t TRIE_TYPE_BYTES; // = 0;
static const int32_t TRIE_TYPE_UCHARS; // = 1;
static const int32_t TRIE_TYPE_MASK; // = 7;
static const int32_t TRIE_HAS_VALUES; // = 8;
// Transform type values and masks (see IX_TRANSFORM).
static const int32_t TRANSFORM_NONE; // = 0;
static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000;
static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000;
static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff;
// Positions within the indexes[] array; IX_COUNT is the number of slots.
enum {
// Byte offsets from the start of the data, after the generic header.
IX_STRING_TRIE_OFFSET,
IX_RESERVED1_OFFSET,
IX_RESERVED2_OFFSET,
IX_TOTAL_SIZE,
// Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.
IX_TRIE_TYPE,
// Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.
IX_TRANSFORM,
IX_RESERVED6,
IX_RESERVED7,
IX_COUNT
};
};
/**
* Wrapper class around generic dictionaries, implementing matches().
* getType() should return a TRIE_TYPE_??? constant from DictionaryData.
*
* All implementations of this interface must be thread-safe if they are to be used inside of the
* dictionary-based break iteration code.
*/
class U_COMMON_API DictionaryMatcher : public UMemory {
public:
DictionaryMatcher() {}
virtual ~DictionaryMatcher();
// this should emulate CompactTrieDictionary::matches()
/* Finds the dictionary words that start at the current position of the text.
*
* @param text The text in which to look for matching words. Matching begins
* at the current position of the UText.
* @param maxLength The max length of match to consider. Units are the native indexing
* units of the UText.
* @param limit Capacity of output arrays, which is also the maximum number of
* matching words to be found.
* @param lengths output array, filled with the lengths of the matches, in order,
* from shortest to longest. Lengths are in native indexing units
* of the UText. May be nullptr.
* @param cpLengths output array, filled with the lengths of the matches, in order,
* from shortest to longest. Lengths are the number of Unicode code points.
* May be nullptr.
* @param values Output array, filled with the values associated with the words found.
* May be nullptr.
* @param prefix Output parameter, the code point length of the prefix match, even if that
* prefix didn't lead to a complete word. Will always be >= the cpLength
* of the longest complete word matched. May be nullptr.
* @return Number of matching words found.
*/
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const = 0;
/** @return DictionaryData::TRIE_TYPE_XYZ */
virtual int32_t getType() const = 0;
};
// Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary
class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {
public:
// constructs a new UCharsDictionaryMatcher.
// c is the serialized trie data that matches() wraps in a UCharsTrie.
// The UDataMemory * will be closed on this object's destruction.
UCharsDictionaryMatcher(const char16_t *c, UDataMemory *f) : characters(c), file(f) { }
virtual ~UCharsDictionaryMatcher();
// See DictionaryMatcher::matches().
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const override;
// Returns DictionaryData::TRIE_TYPE_UCHARS.
virtual int32_t getType() const override;
private:
// Trie data; not freed by this class (only 'file' is closed).
const char16_t *characters;
// Closed in the destructor.
UDataMemory *file;
};
// Implementation of the DictionaryMatcher interface for a BytesTrie dictionary
class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {
public:
// constructs a new BytesDictionaryMatcher
// c is the serialized trie data that matches() wraps in a BytesTrie
// the transform constant should be the constant read from the file, not a masked version!
// the UDataMemory * fed in here will be closed on this object's destruction
BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)
: characters(c), transformConstant(t), file(f) { }
virtual ~BytesDictionaryMatcher();
// See DictionaryMatcher::matches().
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const override;
// Returns DictionaryData::TRIE_TYPE_BYTES.
virtual int32_t getType() const override;
private:
// Maps a code point to the value fed to the BytesTrie, according to transformConstant.
UChar32 transform(UChar32 c) const;
// Trie data; not freed by this class (only 'file' is closed).
const char *characters;
// Unmasked transform specification (type bits plus offset bits).
int32_t transformConstant;
// Closed in the destructor.
UDataMemory *file;
};
U_NAMESPACE_END
/**
* Swaps the byte order of dictionary (.dict) data.
* If length is negative, the dictionary body is neither copied nor swapped
* and only the size is computed.
* @return the header size plus the total size recorded in the data's indexes,
* or 0 if an error was set in *pErrorCode.
*/
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
/**
* Format of dictionary .dict data files.
* Format version 1.0.
*
* A dictionary .dict data file contains a byte-serialized BytesTrie or
* a UChars-serialized UCharsTrie.
* Such files are used in dictionary-based break iteration (DBBI).
*
* For a BytesTrie, a transformation type is specified for
* transforming Unicode strings into byte sequences.
*
* A .dict file begins with a standard ICU data file header
* (DataHeader, see ucmndata.h and unicode/udata.h).
* The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).
*
* After the header, the file contains the following parts.
* Constants are defined in the DictionaryData class.
*
* For the data structure of BytesTrie & UCharsTrie see
* https://icu.unicode.org/design/struct/tries
* and the bytestrie.h and ucharstrie.h header files.
*
* int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;
*
* The first four indexes are byte offsets in ascending order.
* Each byte offset marks the start of the next part in the data file,
* and the end of the previous one.
* When two consecutive byte offsets are the same, then the corresponding part is empty.
* Byte offsets are offsets from after the header,
* that is, from the beginning of the indexes[].
* Each part starts at an offset with proper alignment for its data.
* If necessary, the previous part may include padding bytes to achieve this alignment.
*
* trieType=indexes[IX_TRIE_TYPE] defines the trie type.
* transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.
* If the transformation type is TRANSFORM_TYPE_OFFSET,
* then the lower 21 bits contain the offset code point.
* Each code point c is mapped to byte b = (c - offset).
* Code points outside the range offset..(offset+0xfd) cannot be mapped
* and do not occur in the dictionary, except that U+200C and U+200D
* are mapped to the reserved byte values 0xfe and 0xff respectively.
*
* stringTrie; -- a serialized BytesTrie or UCharsTrie
*
* The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),
* or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).
*/
#endif /* !UCONFIG_NO_BREAK_ITERATION */
#endif /* __DICTIONARYDATA_H__ */

View file

@ -0,0 +1,63 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*******************************************************************************
* Copyright (C) 2008, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*
* File DTINTRV.CPP
*
*******************************************************************************
*/
#include "unicode/dtintrv.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DateInterval)
//DateInterval::DateInterval(){}
// Stores the given start ("from") and end ("to") dates.
DateInterval::DateInterval(UDate from, UDate to) : fromDate(from), toDate(to) {}
// Nothing to release: the interval only holds two UDate values.
DateInterval::~DateInterval() = default;
// Copy constructor: duplicates both endpoints of the other interval.
DateInterval::DateInterval(const DateInterval& other)
    : UObject(other), fromDate(other.fromDate), toDate(other.toDate) {}
// Copy assignment: takes over both endpoints of the other interval.
DateInterval& DateInterval::operator=(const DateInterval& other) {
    if (this == &other) {
        return *this;  // self-assignment, nothing to copy
    }
    fromDate = other.fromDate;
    toDate = other.toDate;
    return *this;
}
// Returns a newly allocated copy of this interval.
DateInterval* DateInterval::clone() const {
    DateInterval* copy = new DateInterval(*this);
    return copy;
}
// Two intervals are equal when both their start and end dates compare equal.
bool DateInterval::operator==(const DateInterval& other) const {
    if (fromDate != other.fromDate) {
        return false;
    }
    return toDate == other.toDate;
}
U_NAMESPACE_END

804
engine/thirdparty/icu4c/common/edits.cpp vendored Normal file
View file

@ -0,0 +1,804 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// edits.cpp
// created: 2017feb08 Markus W. Scherer
#include "unicode/edits.h"
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#include "cmemory.h"
#include "uassert.h"
#include "util.h"
U_NAMESPACE_BEGIN
namespace {

// Layout of the 16-bit edit units, by value range.

// 0000uuuuuuuuuuuu records u+1 unchanged text units.
constexpr int32_t MAX_UNCHANGED_LENGTH = 0x1000;
constexpr int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;

// 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
constexpr int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;
constexpr int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;
constexpr int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;
constexpr int32_t MAX_SHORT_CHANGE = 0x6fff;

// 0111mmmmmmnnnnnn records a replacement of m text units with n.
// m or n = 61: actual length follows in the next edits array unit.
// m or n = 62..63: actual length follows in the next two edits array units.
// Bit 30 of the actual length is in the head unit.
// Trailing units have bit 15 set.
constexpr int32_t LENGTH_IN_1TRAIL = 61;
constexpr int32_t LENGTH_IN_2TRAIL = 62;

}  // namespace
// Frees the edits array unless it is the object's own stackArray.
void Edits::releaseArray() noexcept {
    if (array == stackArray) {
        return;  // built-in storage, nothing to free
    }
    uprv_free(array);
}
// Copies `length` edit units from other.array into this object's array,
// allocating a larger array first when the current capacity is too small.
// Relies on length and errorCode_ having been set by the caller
// (the copy assignment operator does this before calling).
// On a pre-existing error or an allocation failure the edits are cleared.
Edits &Edits::copyArray(const Edits &other) {
    if (U_FAILURE(errorCode_)) {
        length = delta = numChanges = 0;
        return *this;
    }
    const size_t byteCount = static_cast<size_t>(length) * 2;
    if (capacity < length) {
        uint16_t *grown = static_cast<uint16_t *>(uprv_malloc(byteCount));
        if (grown == nullptr) {
            length = delta = numChanges = 0;
            errorCode_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        releaseArray();
        array = grown;
        capacity = length;
    }
    if (length > 0) {
        uprv_memcpy(array, other.array, byteCount);
    }
    return *this;
}
// Takes over the edit units of src. Relies on length and errorCode_ having
// been set by the caller (the move assignment operator does this first).
// Short contents are copied into the built-in stackArray; longer contents
// are adopted by stealing src's array, after which src is reset.
Edits &Edits::moveArray(Edits &src) noexcept {
    if (U_FAILURE(errorCode_)) {
        length = delta = numChanges = 0;
        return *this;
    }
    releaseArray();
    if (length <= STACK_CAPACITY) {
        // Fits into our own stack storage: copy the units.
        array = stackArray;
        capacity = STACK_CAPACITY;
        if (length > 0) {
            uprv_memcpy(array, src.array, (size_t)length * 2);
        }
    } else {
        // Adopt src's array and leave src empty on its stack storage.
        array = src.array;
        capacity = src.capacity;
        src.array = src.stackArray;
        src.capacity = STACK_CAPACITY;
        src.reset();
    }
    return *this;
}
// Copy assignment: copies the counters and error state, then the edit units.
Edits &Edits::operator=(const Edits &other) {
    if (this != &other) {
        length = other.length;
        delta = other.delta;
        numChanges = other.numChanges;
        errorCode_ = other.errorCode_;
        copyArray(other);
    }
    return *this;
}
// Move assignment: takes over the counters, error state and edit units of src.
// Self-move is a no-op. Without the guard, moveArray() would release this
// object's heap array and then read the edit units back through src (the same
// object), which can leave a non-zero length over unrelated array contents.
Edits &Edits::operator=(Edits &&src) noexcept {
    if (this == &src) { return *this; }  // self-move: keep the current contents
    length = src.length;
    delta = src.delta;
    numChanges = src.numChanges;
    errorCode_ = src.errorCode_;
    return moveArray(src);
}
// Destructor: releases the edits array if it is not the built-in stackArray
// (see releaseArray()).
Edits::~Edits() {
releaseArray();
}
// Clears all recorded edits and any error state; the array storage is kept.
void Edits::reset() noexcept {
    errorCode_ = U_ZERO_ERROR;
    numChanges = 0;
    delta = 0;
    length = 0;
}
// Records unchangedLength unchanged text units.
// Does nothing for a zero length or if an error is already set;
// a negative length sets U_ILLEGAL_ARGUMENT_ERROR.
void Edits::addUnchanged(int32_t unchangedLength) {
    if (unchangedLength == 0 || U_FAILURE(errorCode_)) {
        return;
    }
    if (unchangedLength < 0) {
        errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    // First try to extend a preceding unchanged-text unit.
    const int32_t previous = lastUnit();
    if (previous < MAX_UNCHANGED) {
        const int32_t room = MAX_UNCHANGED - previous;
        if (unchangedLength <= room) {
            setLastUnit(previous + unchangedLength);
            return;
        }
        setLastUnit(MAX_UNCHANGED);
        unchangedLength -= room;
    }
    // Emit as many full-size units as needed ...
    for (; unchangedLength >= MAX_UNCHANGED_LENGTH; unchangedLength -= MAX_UNCHANGED_LENGTH) {
        append(MAX_UNCHANGED);
    }
    // ... and one final unit for whatever is left (a unit stores length-1).
    if (unchangedLength > 0) {
        append(unchangedLength - 1);
    }
}
// Records one replacement of oldLength text units by newLength text units.
// - Does nothing if an error is already set.
// - Negative lengths set U_ILLEGAL_ARGUMENT_ERROR.
// - A 0:0 replacement is ignored.
// Otherwise numChanges is incremented and the accumulated length delta is
// updated (U_INDEX_OUTOFBOUNDS_ERROR if that would overflow int32_t).
// The change is then encoded either as a short-change unit (merged into an
// identical preceding unit while its counter has room) or as a 0x7000-based
// head unit followed by up to four trailing length units.
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
if(U_FAILURE(errorCode_)) { return; }
if(oldLength < 0 || newLength < 0) {
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (oldLength == 0 && newLength == 0) {
return;
}
// Note: the change is counted before the overflow check below.
++numChanges;
int32_t newDelta = newLength - oldLength;
if (newDelta != 0) {
if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
(newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) {
// Integer overflow or underflow.
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
delta += newDelta;
}
if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
// Merge into previous same-lengths short-replacement record, if any.
int32_t u = (oldLength << 12) | (newLength << 9);
int32_t last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last & ~SHORT_CHANGE_NUM_MASK) == u &&
(last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
setLastUnit(last + 1);
return;
}
append(u);
return;
}
// Long form: head unit 0111mmmmmmnnnnnn, optionally followed by trail units.
int32_t head = 0x7000;
if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
// Both lengths fit directly into the head unit.
head |= oldLength << 6;
head |= newLength;
append(head);
} else if ((capacity - length) >= 5 || growArray()) {
// Room is needed for the head plus up to two trail units per length.
// Trail units are written first, starting right after the head's slot.
int32_t limit = length + 1;
if(oldLength < LENGTH_IN_1TRAIL) {
head |= oldLength << 6;
} else if(oldLength <= 0x7fff) {
head |= LENGTH_IN_1TRAIL << 6;
array[limit++] = (uint16_t)(0x8000 | oldLength);
} else {
head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15));
array[limit++] = (uint16_t)(0x8000 | oldLength);
}
if(newLength < LENGTH_IN_1TRAIL) {
head |= newLength;
} else if(newLength <= 0x7fff) {
head |= LENGTH_IN_1TRAIL;
array[limit++] = (uint16_t)(0x8000 | newLength);
} else {
head |= LENGTH_IN_2TRAIL + (newLength >> 30);
array[limit++] = (uint16_t)(0x8000 | (newLength >> 15));
array[limit++] = (uint16_t)(0x8000 | newLength);
}
// The head is stored last, once all of its length bits are known.
array[length] = (uint16_t)head;
length = limit;
}
}
// Appends one edit unit, growing the array first if it is full.
// If growing fails, the unit is dropped (growArray() sets the error code).
void Edits::append(int32_t r) {
    if (length >= capacity && !growArray()) {
        return;
    }
    array[length++] = (uint16_t)r;
}
// Replaces the edits array with a larger one and copies the existing units.
// Returns true on success. On failure sets errorCode_ and returns false:
// U_INDEX_OUTOFBOUNDS_ERROR when the capacity cannot grow (enough) any more,
// U_MEMORY_ALLOCATION_ERROR when the allocation fails.
UBool Edits::growArray() {
    const bool onStack = (array == stackArray);
    if (!onStack && capacity == INT32_MAX) {
        // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
        // with a result-string-buffer overflow.
        errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }
    int32_t grownCapacity;
    if (onStack) {
        grownCapacity = 2000;
    } else if (capacity >= (INT32_MAX / 2)) {
        grownCapacity = INT32_MAX;
    } else {
        grownCapacity = capacity * 2;
    }
    // Grow by at least 5 units so that a maximal change record will fit.
    if ((grownCapacity - capacity) < 5) {
        errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }
    uint16_t *grown = (uint16_t *)uprv_malloc((size_t)grownCapacity * 2);
    if (grown == nullptr) {
        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    uprv_memcpy(grown, array, (size_t)length * 2);
    releaseArray();
    array = grown;
    capacity = grownCapacity;
    return true;
}
// Reports whether there is an error, in outErrorCode or in this object.
// Returns true if outErrorCode already indicates a failure (left unchanged),
// or if this object has an error, which is then copied to outErrorCode.
// Returns false if neither indicates a failure.
UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const {
    if (U_SUCCESS(outErrorCode)) {
        if (U_SUCCESS(errorCode_)) {
            return false;
        }
        outErrorCode = errorCode_;
    }
    return true;
}
// Merges two consecutive sets of edits -- ab (string a -> string b) and
// bc (string b -> string c) -- and appends the combined a -> c edits to this object.
// Returns *this.
// Does nothing if errorCode or this object already carries a failure.
// If the two inputs disagree about the length of the intermediate string b,
// errorCode is set to U_ILLEGAL_ARGUMENT_ERROR (unless this object has its own
// error, which is copied out instead) and the function returns early.
Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) {
if (copyErrorTo(errorCode)) { return *this; }
// Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
// Parallel iteration over both Edits.
Iterator abIter = ab.getFineIterator();
Iterator bcIter = bc.getFineIterator();
UBool abHasNext = true, bcHasNext = true;
// Copy iterator state into local variables, so that we can modify and subdivide spans.
// ab old & new length, bc old & new length
int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
// When we have different-intermediate-length changes, we accumulate a larger change.
int32_t pending_aLength = 0, pending_cLength = 0;
for (;;) {
// At this point, for each of the two iterators:
// Either we are done with the locally cached current edit,
// and its intermediate-string length has been reset,
// or we will continue to work with a truncated remainder of this edit.
//
// If the current edit is done, and the iterator has not yet reached the end,
// then we fetch the next edit. This is true for at least one of the iterators.
//
// Normally it does not matter whether we fetch from ab and then bc or vice versa.
// However, the result is observably different when
// ab deletions meet bc insertions at the same intermediate-string index.
// Some users expect the bc insertions to come first, so we fetch from bc first.
if (bc_bLength == 0) {
if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) {
bc_bLength = bcIter.oldLength();
cLength = bcIter.newLength();
if (bc_bLength == 0) {
// insertion
if (ab_bLength == 0 || !abIter.hasChange()) {
addReplace(pending_aLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
} else {
pending_cLength += cLength;
}
continue;
}
}
// else see if the other iterator is done, too.
}
if (ab_bLength == 0) {
if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) {
aLength = abIter.oldLength();
ab_bLength = abIter.newLength();
if (ab_bLength == 0) {
// deletion
if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
addReplace(pending_aLength + aLength, pending_cLength);
pending_aLength = pending_cLength = 0;
} else {
pending_aLength += aLength;
}
continue;
}
} else if (bc_bLength == 0) {
// Both iterators are done at the same time:
// The intermediate-string lengths match.
break;
} else {
// The ab output string is shorter than the bc input string.
if (!copyErrorTo(errorCode)) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
}
return *this;
}
}
if (bc_bLength == 0) {
// The bc input string is shorter than the ab output string.
if (!copyErrorTo(errorCode)) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
}
return *this;
}
// Done fetching: ab_bLength > 0 && bc_bLength > 0
// The current state has two parts:
// - Past: We accumulate a longer ac edit in the "pending" variables.
// - Current: We have copies of the current ab/bc edits in local variables.
// At least one side is newly fetched.
// One side might be a truncated remainder of an edit we fetched earlier.
if (!abIter.hasChange() && !bcIter.hasChange()) {
// An unchanged span all the way from string a to string c.
if (pending_aLength != 0 || pending_cLength != 0) {
addReplace(pending_aLength, pending_cLength);
pending_aLength = pending_cLength = 0;
}
int32_t unchangedLength = aLength <= cLength ? aLength : cLength;
addUnchanged(unchangedLength);
ab_bLength = aLength -= unchangedLength;
bc_bLength = cLength -= unchangedLength;
// At least one of the unchanged spans is now empty.
continue;
}
if (!abIter.hasChange() && bcIter.hasChange()) {
// Unchanged a->b but changed b->c.
if (ab_bLength >= bc_bLength) {
// Split the longer unchanged span into change + remainder.
addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
aLength = ab_bLength -= bc_bLength;
bc_bLength = 0;
continue;
}
// Handle the shorter unchanged span below like a change.
} else if (abIter.hasChange() && !bcIter.hasChange()) {
// Changed a->b and then unchanged b->c.
if (ab_bLength <= bc_bLength) {
// Split the longer unchanged span into change + remainder.
addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
pending_aLength = pending_cLength = 0;
cLength = bc_bLength -= ab_bLength;
ab_bLength = 0;
continue;
}
// Handle the shorter unchanged span below like a change.
} else { // both abIter.hasChange() && bcIter.hasChange()
if (ab_bLength == bc_bLength) {
// Changes on both sides up to the same position. Emit & reset.
addReplace(pending_aLength + aLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
ab_bLength = bc_bLength = 0;
continue;
}
}
// Accumulate the a->c change, reset the shorter side,
// keep a remainder of the longer one.
pending_aLength += aLength;
pending_cLength += cLength;
if (ab_bLength < bc_bLength) {
bc_bLength -= ab_bLength;
cLength = ab_bLength = 0;
} else { // ab_bLength > bc_bLength
ab_bLength -= bc_bLength;
aLength = bc_bLength = 0;
}
}
// Flush whatever change is still being accumulated when both inputs end.
if (pending_aLength != 0 || pending_cLength != 0) {
addReplace(pending_aLength, pending_cLength);
}
// Surface any error that addReplace()/addUnchanged() recorded in this object.
copyErrorTo(errorCode);
return *this;
}
// Creates an iterator over the len edit units starting at a.
// oc is stored as onlyChanges_ and crs as coarse. All position and span state
// starts at zero, with no iteration direction yet (dir=0).
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
array(a), index(0), length(len), remaining(0),
onlyChanges_(oc), coarse(crs),
dir(0), changed(false), oldLength_(0), newLength_(0),
srcIndex(0), replIndex(0), destIndex(0) {}
// Decodes one length from a 6-bit head field.
// Values below LENGTH_IN_1TRAIL are the length itself. Otherwise the length is
// read from one or two trailing units at the current index (15 payload bits each,
// plus the lowest head bit as bit 30), and index is advanced past them.
int32_t Edits::Iterator::readLength(int32_t head) {
    if (head < LENGTH_IN_1TRAIL) {
        return head;
    }
    if (head < LENGTH_IN_2TRAIL) {
        // One trail unit.
        U_ASSERT(index < length);
        U_ASSERT(array[index] >= 0x8000);
        const int32_t value = array[index] & 0x7fff;
        ++index;
        return value;
    }
    // Two trail units.
    U_ASSERT((index + 2) <= length);
    U_ASSERT(array[index] >= 0x8000);
    U_ASSERT(array[index + 1] >= 0x8000);
    const int32_t topBit = (head & 1) << 30;
    const int32_t middle = (int32_t)(array[index] & 0x7fff) << 15;
    const int32_t low = array[index + 1] & 0x7fff;
    index += 2;
    return topBit | middle | low;
}
// Moves the string indexes forward past the current span.
// replIndex only advances for change spans.
void Edits::Iterator::updateNextIndexes() {
    srcIndex += oldLength_;
    destIndex += newLength_;
    if (changed) {
        replIndex += newLength_;
    }
}
// Moves the string indexes backward to the start of the current span.
// replIndex only moves for change spans.
void Edits::Iterator::updatePreviousIndexes() {
    srcIndex -= oldLength_;
    destIndex -= newLength_;
    if (changed) {
        replIndex -= newLength_;
    }
}
// Puts the iterator into the "no span" state used before or beyond the string:
// an empty, unchanged span with no direction. Always returns false.
UBool Edits::Iterator::noNext() {
    oldLength_ = 0;
    newLength_ = 0;
    changed = false;
    dir = 0;
    return false;
}
// Advances to the next edit span and returns true if there is one.
// Returns false immediately if errorCode already indicates a failure, and
// returns false via noNext() when the end of the edits array is reached.
// With onlyChanges, unchanged spans are skipped, but their lengths are still
// added to the string indexes.
// A coarse iterator merges adjacent change units into one span; a fine-grained
// one returns each change separately, using `remaining` to step through a
// compressed sequence of equal short changes stored in a single unit.
UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
// Forward iteration: Update the string indexes to the limit of the current span,
// and post-increment-read array units to assemble a new span.
// Leaves the array index one after the last unit of that span.
if (U_FAILURE(errorCode)) { return false; }
// We have an errorCode in case we need to start guarding against integer overflows.
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
if (dir > 0) {
updateNextIndexes();
} else {
if (dir < 0) {
// Turn around from previous() to next().
// Post-increment-read the same span again.
if (remaining > 0) {
// Fine-grained iterator:
// Stay on the current one of a sequence of compressed changes.
++index; // next() rests on the index after the sequence unit.
dir = 1;
return true;
}
}
dir = 1;
}
if (remaining >= 1) {
// Fine-grained iterator: Continue a sequence of compressed changes.
if (remaining > 1) {
--remaining;
return true;
}
remaining = 0;
}
if (index >= length) {
return noNext();
}
int32_t u = array[index++];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
changed = false;
oldLength_ = u + 1;
while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
++index;
oldLength_ += u + 1;
}
newLength_ = oldLength_;
if (onlyChanges) {
updateNextIndexes();
if (index >= length) {
return noNext();
}
// already fetched u > MAX_UNCHANGED at index
++index;
} else {
return true;
}
}
changed = true;
if (u <= MAX_SHORT_CHANGE) {
// Short change unit: old length, new length and repeat count packed in u.
int32_t oldLen = u >> 12;
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
if (coarse) {
oldLength_ = num * oldLen;
newLength_ = num * newLen;
} else {
// Split a sequence of changes that was compressed into one unit.
oldLength_ = oldLen;
newLength_ = newLen;
if (num > 1) {
remaining = num; // This is the first of two or more changes.
}
return true;
}
} else {
// Long change: head unit with two 6-bit length fields, maybe trail units.
U_ASSERT(u <= 0x7fff);
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
if (!coarse) {
return true;
}
}
// Combine adjacent changes.
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
++index;
if (u <= MAX_SHORT_CHANGE) {
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
oldLength_ += (u >> 12) * num;
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
} else {
U_ASSERT(u <= 0x7fff);
oldLength_ += readLength((u >> 6) & 0x3f);
newLength_ += readLength(u & 0x3f);
}
}
return true;
}
// Moves back to the previous edit span and returns true if there is one.
// Returns false immediately if errorCode already indicates a failure, and
// returns false via noNext() when the start of the edits array is reached.
// Mirrors next(): a coarse iterator merges adjacent change units, a fine-grained
// one steps through compressed short-change sequences using `remaining`.
// onlyChanges is not handled here (see the comment in the unchanged-span branch).
UBool Edits::Iterator::previous(UErrorCode &errorCode) {
// Backward iteration: Pre-decrement-read array units to assemble a new span,
// then update the string indexes to the start of that span.
// Leaves the array index on the head unit of that span.
if (U_FAILURE(errorCode)) { return false; }
// We have an errorCode in case we need to start guarding against integer overflows.
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
if (dir >= 0) {
if (dir > 0) {
// Turn around from next() to previous().
// Set the string indexes to the span limit and
// pre-decrement-read the same span again.
if (remaining > 0) {
// Fine-grained iterator:
// Stay on the current one of a sequence of compressed changes.
--index; // previous() rests on the sequence unit.
dir = -1;
return true;
}
updateNextIndexes();
}
dir = -1;
}
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of compressed changes.
int32_t u = array[index];
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
++remaining;
updatePreviousIndexes();
return true;
}
remaining = 0;
}
if (index <= 0) {
return noNext();
}
int32_t u = array[--index];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
changed = false;
oldLength_ = u + 1;
while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
--index;
oldLength_ += u + 1;
}
newLength_ = oldLength_;
// No need to handle onlyChanges as long as previous() is called only from findIndex().
updatePreviousIndexes();
return true;
}
changed = true;
if (u <= MAX_SHORT_CHANGE) {
// Short change unit: old length, new length and repeat count packed in u.
int32_t oldLen = u >> 12;
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
if (coarse) {
oldLength_ = num * oldLen;
newLength_ = num * newLen;
} else {
// Split a sequence of changes that was compressed into one unit.
oldLength_ = oldLen;
newLength_ = newLen;
if (num > 1) {
remaining = 1; // This is the last of two or more changes.
}
updatePreviousIndexes();
return true;
}
} else {
if (u <= 0x7fff) {
// The change is encoded in u alone.
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
} else {
// Back up to the head of the change, read the lengths,
// and reset the index to the head again.
U_ASSERT(index > 0);
while ((u = array[--index]) > 0x7fff) {}
U_ASSERT(u > MAX_SHORT_CHANGE);
int32_t headIndex = index++;
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
index = headIndex;
}
if (!coarse) {
updatePreviousIndexes();
return true;
}
}
// Combine adjacent changes.
while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
--index;
if (u <= MAX_SHORT_CHANGE) {
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
oldLength_ += (u >> 12) * num;
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
} else if (u <= 0x7fff) {
// Read the lengths, and reset the index to the head again.
int32_t headIndex = index++;
oldLength_ += readLength((u >> 6) & 0x3f);
newLength_ += readLength(u & 0x3f);
index = headIndex;
}
// Units above 0x7fff (bit 15 set) are trail units: they are only stepped
// over here; their lengths are read when the loop reaches the head unit.
}
updatePreviousIndexes();
return true;
}
// Positions the iterator on the span that contains index i of the source string
// (findSource=true) or of the destination string (findSource=false).
// Returns:
//   -1 if errorCode already indicates a failure or i is negative,
//    0 if the iterator now rests on a span containing i,
//    1 if forward iteration reached the end without finding such a span.
// When i lies before the current span, the search goes backwards if i is in the
// upper half of [0, spanStart); otherwise the iterator is reset to the start
// and searches forward.
int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || i < 0) { return -1; }
int32_t spanStart, spanLength;
if (findSource) { // find source index
spanStart = srcIndex;
spanLength = oldLength_;
} else { // find destination index
spanStart = destIndex;
spanLength = newLength_;
}
if (i < spanStart) {
if (i >= (spanStart / 2)) {
// Search backwards.
for (;;) {
UBool hasPrevious = previous(errorCode);
U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0
(void)hasPrevious; // avoid unused-variable warning
spanStart = findSource ? srcIndex : destIndex;
if (i >= spanStart) {
// The index is in the current span.
return 0;
}
if (remaining > 0) {
// Is the index in one of the remaining compressed edits?
// spanStart is the start of the current span, first of the remaining ones.
spanLength = findSource ? oldLength_ : newLength_;
int32_t u = array[index];
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
int32_t len = num * spanLength;
if (i >= (spanStart - len)) {
int32_t n = ((spanStart - i - 1) / spanLength) + 1;
// 1 <= n <= num
srcIndex -= n * oldLength_;
replIndex -= n * newLength_;
destIndex -= n * newLength_;
remaining += n;
return 0;
}
// Skip all of these edits at once.
srcIndex -= num * oldLength_;
replIndex -= num * newLength_;
destIndex -= num * newLength_;
remaining = 0;
}
}
}
// Reset the iterator to the start.
dir = 0;
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} else if (i < (spanStart + spanLength)) {
// The index is in the current span.
return 0;
}
// Search forwards, including unchanged spans (onlyChanges=false).
while (next(false, errorCode)) {
if (findSource) {
spanStart = srcIndex;
spanLength = oldLength_;
} else {
spanStart = destIndex;
spanLength = newLength_;
}
if (i < (spanStart + spanLength)) {
// The index is in the current span.
return 0;
}
if (remaining > 1) {
// Is the index in one of the remaining compressed edits?
// spanStart is the start of the current span, first of the remaining ones.
int32_t len = remaining * spanLength;
if (i < (spanStart + len)) {
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
srcIndex += n * oldLength_;
replIndex += n * newLength_;
destIndex += n * newLength_;
remaining -= n;
return 0;
}
// Make next() skip all of these edits at once.
oldLength_ *= remaining;
newLength_ *= remaining;
remaining = 0;
}
}
return 1;
}
// Maps a source-string index to the corresponding destination-string index.
// Returns 0 on error or for an index before the string. An index inside a
// change span maps to the end of that span's replacement; an index inside an
// unchanged span maps 1:1.
int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) {
    const int32_t where = findIndex(i, true, errorCode);
    if (where < 0) {
        // Error or before the string.
        return 0;
    }
    if (where > 0 || i == srcIndex) {
        // At or after string length, or at start of the found span.
        return destIndex;
    }
    // Inside the found span: a change maps to its end, unchanged text maps 1:1.
    return changed ? destIndex + newLength_ : destIndex + (i - srcIndex);
}
// Maps a destination-string index to the corresponding source-string index.
// Returns 0 on error or for an index before the string. An index inside a
// change span maps to the end of the replaced source text; an index inside an
// unchanged span maps 1:1.
int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) {
    const int32_t where = findIndex(i, false, errorCode);
    if (where < 0) {
        // Error or before the string.
        return 0;
    }
    if (where > 0 || i == destIndex) {
        // At or after string length, or at start of the found span.
        return srcIndex;
    }
    // Inside the found span: a change maps to its end, unchanged text maps 1:1.
    return changed ? srcIndex + oldLength_ : srcIndex + (i - destIndex);
}
// Appends a human-readable description of the current span to sb and returns sb.
// The text shows the source range, the destination range and, for change spans,
// the range within the replacement text.
UnicodeString& Edits::Iterator::toString(UnicodeString& sb) const {
    const int32_t srcLimit = srcIndex + oldLength_;
    const int32_t destLimit = destIndex + newLength_;

    sb.append(u"{ src[", -1);
    ICU_Utility::appendNumber(sb, srcIndex);
    sb.append(u"..", -1);
    ICU_Utility::appendNumber(sb, srcLimit);
    if (changed) {
        sb.append(u"] ⇝ dest[", -1);
    } else {
        sb.append(u"] ≡ dest[", -1);
    }
    ICU_Utility::appendNumber(sb, destIndex);
    sb.append(u"..", -1);
    ICU_Utility::appendNumber(sb, destLimit);

    if (!changed) {
        sb.append(u"] (no-change) }", -1);
        return sb;
    }
    sb.append(u"], repl[", -1);
    ICU_Utility::appendNumber(sb, replIndex);
    sb.append(u"..", -1);
    ICU_Utility::appendNumber(sb, replIndex + newLength_);
    sb.append(u"] }", -1);
    return sb;
}
U_NAMESPACE_END

View file

@ -0,0 +1,220 @@
// © 2021 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html
// emojiprops.cpp
// created: 2021sep04 Markus W. Scherer
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucharstrie.h"
#include "unicode/ucptrie.h"
#include "unicode/udata.h"
#include "unicode/ustringtrie.h"
#include "unicode/utf16.h"
#include "emojiprops.h"
#include "ucln.h"
#include "ucln_cmn.h"
#include "umutex.h"
#include "uset_imp.h"
U_NAMESPACE_BEGIN
namespace {

// The lazily created shared instance and the once-flag guarding its creation.
EmojiProps *singleton = nullptr;
icu::UInitOnce emojiInitOnce {};

// Library cleanup callback: destroys the singleton and re-arms the once-flag.
UBool U_CALLCONV emojiprops_cleanup() {
    delete singleton;
    singleton = nullptr;
    emojiInitOnce.reset();
    return true;
}

// One-time initializer: creates and loads the singleton.
// On failure the singleton stays nullptr and errorCode reports the reason.
// The cleanup callback is registered in either case.
void U_CALLCONV initSingleton(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }
    EmojiProps *props = new EmojiProps(errorCode);
    if (props == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    } else if (U_FAILURE(errorCode)) {
        delete props;
        props = nullptr;
    }
    singleton = props;
    ucln_common_registerCleanup(UCLN_COMMON_EMOJIPROPS, emojiprops_cleanup);
}

// TODO: turn this into a shared helper function
// Requires the major version to match, and then requires at least the minor version.
UBool udata_isAcceptableMajorMinor(
        const UDataInfo &info, const char16_t *dataFormat, uint8_t major, uint8_t minor) {
    if (info.size < 20) {
        return false;
    }
    if (info.isBigEndian != U_IS_BIG_ENDIAN || info.charsetFamily != U_CHARSET_FAMILY) {
        return false;
    }
    // All four data-format signature bytes must match.
    for (int32_t k = 0; k < 4; ++k) {
        if (info.dataFormat[k] != dataFormat[k]) {
            return false;
        }
    }
    return info.formatVersion[0] == major && info.formatVersion[1] >= minor;
}

}  // namespace
// Destructor: closes the data memory opened by load() and the code point trie
// that load() opened from it.
EmojiProps::~EmojiProps() {
udata_close(memory);
ucptrie_close(cpTrie);
}
// Returns the shared EmojiProps instance, creating it on first use.
// Returns nullptr if errorCode already indicates a failure on entry;
// after a failed initialization the singleton pointer is nullptr as well.
const EmojiProps *
EmojiProps::getSingleton(UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(emojiInitOnce, &initSingleton, errorCode);
        return singleton;
    }
    return nullptr;
}
// Acceptance callback passed to udata_openChoice() by load():
// accepts data whose format signature is "Emoj" with major format version 1
// and any minor version (>= 0). The context, type and name arguments are unused.
UBool U_CALLCONV
EmojiProps::isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/,
const UDataInfo *pInfo) {
return udata_isAcceptableMajorMinor(*pInfo, u"Emoj", 1, 0);
}
// Opens the "uemoji" data item and sets up the code point trie and the
// per-property string tries.
// On failure errorCode is set and the object is left partially initialized;
// the destructor closes whatever was opened.
void
EmojiProps::load(UErrorCode &errorCode) {
    memory = udata_openChoice(nullptr, "icu", "uemoji", isAcceptable, this, &errorCode);
    if (U_FAILURE(errorCode)) { return; }
    const uint8_t *inBytes = (const uint8_t *)udata_getMemory(memory);
    const int32_t *inIndexes = (const int32_t *)inBytes;
    // The code point trie follows the indexes, so its byte offset gives the indexes length.
    int32_t indexesLength = inIndexes[IX_CPTRIE_OFFSET] / 4;
    // The string-trie loop below reads indexes[i + 1] as the limit of trie i,
    // so the index after IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET must exist as well.
    if (indexesLength <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET + 1) {
        errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.
        return;
    }

    int32_t i = IX_CPTRIE_OFFSET;
    int32_t offset = inIndexes[i++];
    int32_t nextOffset = inIndexes[i];
    cpTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_8,
                                    inBytes + offset, nextOffset - offset, nullptr, &errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }

    for (i = IX_BASIC_EMOJI_TRIE_OFFSET; i <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET; ++i) {
        offset = inIndexes[i];
        nextOffset = inIndexes[i + 1];
        // Set/leave nullptr if there is no UCharsTrie.
        const char16_t *p = nextOffset > offset ? (const char16_t *)(inBytes + offset) : nullptr;
        stringTries[getStringTrieIndex(i)] = p;
    }
}
void
EmojiProps::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
// Add the start code point of each same-value range of the trie.
UChar32 start = 0, end;
uint32_t value;
while ((end = ucptrie_getRange(cpTrie, start, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, &value)) >= 0) {
sa->add(sa->set, start);
start = end + 1;
}
}
// Static convenience wrapper for a single code point: looks up the singleton
// and delegates to it. Returns false if the singleton could not be obtained.
UBool
EmojiProps::hasBinaryProperty(UChar32 c, UProperty which) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const EmojiProps *ep = getSingleton(errorCode);
    if (U_FAILURE(errorCode)) {
        return false;
    }
    return ep->hasBinaryPropertyImpl(c, which);
}
// Tests whether code point c has the binary emoji property `which`.
// Properties outside UCHAR_EMOJI..UCHAR_RGI_EMOJI, and those marked -1 in the
// table below, are not handled here and yield false.
UBool
EmojiProps::hasBinaryPropertyImpl(UChar32 c, UProperty which) const {
    // Maps each property in UCHAR_EMOJI..UCHAR_RGI_EMOJI to its bit in the trie value.
    // Note: UCHAR_REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.
    static constexpr int8_t bitFlags[] = {
        BIT_EMOJI,                  // UCHAR_EMOJI=57
        BIT_EMOJI_PRESENTATION,     // UCHAR_EMOJI_PRESENTATION=58
        BIT_EMOJI_MODIFIER,         // UCHAR_EMOJI_MODIFIER=59
        BIT_EMOJI_MODIFIER_BASE,    // UCHAR_EMOJI_MODIFIER_BASE=60
        BIT_EMOJI_COMPONENT,        // UCHAR_EMOJI_COMPONENT=61
        -1,                         // UCHAR_REGIONAL_INDICATOR=62
        -1,                         // UCHAR_PREPENDED_CONCATENATION_MARK=63
        BIT_EXTENDED_PICTOGRAPHIC,  // UCHAR_EXTENDED_PICTOGRAPHIC=64
        BIT_BASIC_EMOJI,            // UCHAR_BASIC_EMOJI=65
        -1,                         // UCHAR_EMOJI_KEYCAP_SEQUENCE=66
        -1,                         // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67
        -1,                         // UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68
        -1,                         // UCHAR_RGI_EMOJI_TAG_SEQUENCE=69
        -1,                         // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70
        BIT_BASIC_EMOJI,            // UCHAR_RGI_EMOJI=71
    };
    const bool inTable = UCHAR_EMOJI <= which && which <= UCHAR_RGI_EMOJI;
    if (!inTable) {
        return false;
    }
    const int32_t bit = bitFlags[which - UCHAR_EMOJI];
    if (bit < 0) {
        return false;  // not a property that we support in this function
    }
    const uint8_t bits = UCPTRIE_FAST_GET(cpTrie, UCPTRIE_8, c);
    return (bits >> bit) & 1;
}
// Static convenience wrapper for a string: looks up the singleton and
// delegates to it. Returns false if the singleton could not be obtained.
UBool
EmojiProps::hasBinaryProperty(const char16_t *s, int32_t length, UProperty which) {
    UErrorCode errorCode = U_ZERO_ERROR;
    const EmojiProps *ep = getSingleton(errorCode);
    if (U_FAILURE(errorCode)) {
        return false;
    }
    return ep->hasBinaryPropertyImpl(s, length, which);
}
// Tests whether the string s (length text units, or NUL-terminated if length<0)
// has the emoji property-of-strings `which`.
// Only UCHAR_BASIC_EMOJI..UCHAR_RGI_EMOJI are handled; UCHAR_RGI_EMOJI checks
// all of the individual string tries. Null or empty input yields false.
UBool
EmojiProps::hasBinaryPropertyImpl(const char16_t *s, int32_t length, UProperty which) const {
    if (s == nullptr && length != 0) { return false; }
    if (length <= 0 && (length == 0 || *s == 0)) { return false; }  // empty string
    // The caller should have delegated single code points to hasBinaryProperty(c, which).
    if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) {
        return false;
    }
    // RGI_Emoji is the union of the other emoji properties of strings.
    const bool isUnion = (which == UCHAR_RGI_EMOJI);
    const int32_t firstProp = isUnion ? UCHAR_BASIC_EMOJI : which;
    const int32_t lastProp = isUnion ? UCHAR_RGI_EMOJI_ZWJ_SEQUENCE : which;
    for (int32_t prop = firstProp; prop <= lastProp; ++prop) {
        const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];
        if (trieUChars == nullptr) {
            continue;  // no trie for this property
        }
        UCharsTrie trie(trieUChars);
        UStringTrieResult result = trie.next(s, length);
        if (USTRINGTRIE_HAS_VALUE(result)) {
            return true;
        }
    }
    return false;
}
void
EmojiProps::addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) { return; }
if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) {
return;
}
UProperty firstProp = which, lastProp = which;
if (which == UCHAR_RGI_EMOJI) {
// RGI_Emoji is the union of the other emoji properties of strings.
firstProp = UCHAR_BASIC_EMOJI;
lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE;
}
for (int32_t prop = firstProp; prop <= lastProp; ++prop) {
const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];
if (trieUChars != nullptr) {
UCharsTrie::Iterator iter(trieUChars, 0, errorCode);
while (iter.next(errorCode)) {
const UnicodeString &s = iter.getString();
sa->addString(sa->set, s.getBuffer(), s.length());
}
}
}
}
U_NAMESPACE_END

View file

@ -0,0 +1,90 @@
// © 2021 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html
// emojiprops.h
// created: 2021sep03 Markus W. Scherer
#ifndef __EMOJIPROPS_H__
#define __EMOJIPROPS_H__
#include "unicode/utypes.h"
#include "unicode/ucptrie.h"
#include "unicode/udata.h"
#include "unicode/uobject.h"
#include "uset_imp.h"
U_NAMESPACE_BEGIN
// Access to emoji properties of code points and of strings.
// Holds a code point trie with per-code point property bits and one string
// trie pointer per emoji property of strings; both are set up by load().
class EmojiProps : public UMemory {
public:
// @internal
// Constructs the object and immediately loads its data; check errorCode afterwards.
EmojiProps(UErrorCode &errorCode) { load(errorCode); }
~EmojiProps();
// Returns the shared instance, or nullptr if errorCode indicates a failure on entry.
static const EmojiProps *getSingleton(UErrorCode &errorCode);
// Property test for a single code point, using the shared instance.
static UBool hasBinaryProperty(UChar32 c, UProperty which);
// Property test for a string, using the shared instance.
static UBool hasBinaryProperty(const char16_t *s, int32_t length, UProperty which);
// Adds the start of each same-value range of the code point trie to sa.
void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
// Adds the strings of the given emoji property of strings to sa.
void addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const;
// Positions in the data file's indexes[] array.
enum {
// Byte offsets from the start of the data, after the generic header,
// in ascending order.
// UCPTrie=CodePointTrie, follows the indexes
IX_CPTRIE_OFFSET,
IX_RESERVED1,
IX_RESERVED2,
IX_RESERVED3,
// UCharsTrie=CharsTrie
IX_BASIC_EMOJI_TRIE_OFFSET,
IX_EMOJI_KEYCAP_SEQUENCE_TRIE_OFFSET,
IX_RGI_EMOJI_MODIFIER_SEQUENCE_TRIE_OFFSET,
IX_RGI_EMOJI_FLAG_SEQUENCE_TRIE_OFFSET,
IX_RGI_EMOJI_TAG_SEQUENCE_TRIE_OFFSET,
IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET,
IX_RESERVED10,
IX_RESERVED11,
IX_RESERVED12,
IX_TOTAL_SIZE,
// Not initially byte offsets.
IX_RESERVED14,
IX_RESERVED15,
IX_COUNT // 16
};
// Properties in the code point trie.
// Each enumerator is the bit number of that property in a trie value.
enum {
// https://www.unicode.org/reports/tr51/#Emoji_Properties
BIT_EMOJI,
BIT_EMOJI_PRESENTATION,
BIT_EMOJI_MODIFIER,
BIT_EMOJI_MODIFIER_BASE,
BIT_EMOJI_COMPONENT,
BIT_EXTENDED_PICTOGRAPHIC,
// https://www.unicode.org/reports/tr51/#Emoji_Sets
BIT_BASIC_EMOJI
};
private:
// Acceptance callback used when opening the data item.
static UBool U_CALLCONV
isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
/** Input i: One of the IX_..._TRIE_OFFSET indexes into the data file indexes[] array. */
static int32_t getStringTrieIndex(int32_t i) {
return i - IX_BASIC_EMOJI_TRIE_OFFSET;
}
// Opens the data item and initializes memory, cpTrie and stringTries.
void load(UErrorCode &errorCode);
UBool hasBinaryPropertyImpl(UChar32 c, UProperty which) const;
UBool hasBinaryPropertyImpl(const char16_t *s, int32_t length, UProperty which) const;
// The opened data item; closed in the destructor.
UDataMemory *memory = nullptr;
// Code point trie with the BIT_... property bits; closed in the destructor.
UCPTrie *cpTrie = nullptr;
// One entry per IX_BASIC_EMOJI_TRIE_OFFSET..IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET;
// nullptr where the data has no trie for that property.
const char16_t *stringTries[6] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
};
U_NAMESPACE_END
#endif // __EMOJIPROPS_H__

View file

@ -0,0 +1,42 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: errorcode.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009mar10
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/errorcode.h"
U_NAMESPACE_BEGIN
// Out-of-line destructor with an empty body; nothing is released here.
ErrorCode::~ErrorCode() {}
/**
 * Clears the stored error code back to U_ZERO_ERROR.
 * @return the code that was stored before the reset
 */
UErrorCode ErrorCode::reset() {
    const UErrorCode previous = errorCode;
    errorCode = U_ZERO_ERROR;
    return previous;
}
/**
 * Invokes handleFailure() when the stored code is a failure; otherwise does nothing.
 */
void ErrorCode::assertSuccess() const {
    if(!isFailure()) {
        return;
    }
    handleFailure();
}
/**
 * @return the result of u_errorName() for the currently stored error code
 */
const char* ErrorCode::errorName() const {
    const char *name = u_errorName(errorCode);
    return name;
}
U_NAMESPACE_END

View file

@ -0,0 +1,736 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "cmemory.h"
#include "unicode/filteredbrk.h"
#include "unicode/ucharstriebuilder.h"
#include "unicode/ures.h"
#include "uresimp.h" // ures_getByKeyWithFallback
#include "ubrkimpl.h" // U_ICUDATA_BRKITR
#include "uvector.h"
#include "cmemory.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif
#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only trace helper (compiled when FB_DEBUG is non-zero).
 * Prints one line to stderr with the call site, a message, the string (or the
 * text "nullptr" when s is null), a flag and an integer.
 *
 * @param m message text
 * @param s string to print, may be nullptr
 * @param b flag, printed as 'T' or 'F'
 * @param d integer detail value
 * @param f source file name of the call site
 * @param l source line of the call site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
    char buf[2048];
    const char *text = "nullptr";
    if(s) {
        // extract() only NUL-terminates when the converted text leaves room for it,
        // so reserve the last byte and terminate explicitly; otherwise a long
        // string would make the %s below read past the end of buf.
        s->extract(0, s->length(), buf, static_cast<uint32_t>(sizeof(buf) - 1));
        buf[sizeof(buf) - 1] = 0;
        text = buf;
    }
    fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
            f, l, m, text, (const void*)s, b?'T':'F',(int)d);
}
#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
#define FB_TRACE(m,s,b,d)
#endif
/**
 * Ordering callback passed to sortedInsert().
 * Both elements must carry pointers to UnicodeString objects; the result is
 * that of UnicodeString::compare() on the first string against the second.
 */
static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
    const UnicodeString *lhs = static_cast<const UnicodeString *>(t1.pointer);
    const UnicodeString *rhs = static_cast<const UnicodeString *>(t2.pointer);
    return lhs->compare(*rhs);
}
/**
 * A UVector which implements a set of strings.
 * The vector is created with uprv_deleteUObject as its element deleter and
 * uhash_compareUnicodeString as its element comparer; adopt() inserts in the
 * order defined by compareUnicodeString and skips strings already contained.
 */
class UStringSet : public UVector {
public:
UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
uhash_compareUnicodeString,
1,
status) {}
virtual ~UStringSet();
/**
 * Is this UnicodeString contained?
 */
inline UBool contains(const UnicodeString& s) {
return contains((void*) &s);
}
// Keep the base-class contains() overloads visible next to the one above.
using UVector::contains;
/**
 * Return the ith UnicodeString alias
 */
inline const UnicodeString* getStringAt(int32_t i) const {
return (const UnicodeString*)elementAt(i);
}
/**
 * Adopt the UnicodeString if not already contained.
 * Caller no longer owns the pointer in any case.
 * @return true if adopted successfully, false otherwise (error, or else duplicate)
 */
inline UBool adopt(UnicodeString *str, UErrorCode &status) {
if(U_FAILURE(status) || contains(*str)) {
// Not inserted: delete the string here since ownership was passed in.
delete str;
return false;
} else {
sortedInsert(str, compareUnicodeString, status);
if(U_FAILURE(status)) {
// NOTE(review): str is not deleted on this path; presumably
// sortedInsert() disposes of it on failure - confirm in UVector.
return false;
}
return true;
}
}
/**
 * Add by value.
 * A heap copy of str is made and handed to adopt().
 * @return true if successfully adopted.
 */
inline UBool add(const UnicodeString& str, UErrorCode &status) {
if(U_FAILURE(status)) return false;
UnicodeString *t = new UnicodeString(str);
if(t==nullptr) {
status = U_MEMORY_ALLOCATION_ERROR; return false;
}
return adopt(t, status);
}
/**
 * Remove this string.
 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
 */
inline UBool remove(const UnicodeString &s, UErrorCode &status) {
if(U_FAILURE(status)) return false;
return removeElement((void*) &s);
}
};
/**
 * Virtual, won't be inlined.
 * The body is empty; it is defined out of line, outside the class body.
 */
UStringSet::~UStringSet() {}
/* ----------------------------------------------------------- */
/* Filtered Break constants */
// Values attached to trie entries: build() adds entries with these values and
// breakExceptionAt() compares the value found in the backwards trie against them.
static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
// Flags stored in the per-string partials[] array inside build().
static const int32_t kSuppressInReverse = (1<<0);
static const int32_t kAddToForward = (1<<1);
// The only abbreviation-internal separator build() looks for.
static const char16_t kFULLSTOP = 0x002E; // '.'
/**
 * Shared data for SimpleFilteredSentenceBreakIterator
 *
 * Holds the two tries and a reference count that starts at 1.
 * incr() adds a reference; decr() drops one and deletes this object when the
 * decremented count is zero or less.
 */
class SimpleFilteredSentenceBreakData : public UMemory {
public:
// Takes ownership of both tries (either may be nullptr); refcount starts at 1.
SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
: fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
// Adds one reference and returns this, so the result can be stored directly.
SimpleFilteredSentenceBreakData *incr() {
umtx_atomic_inc(&refcount);
return this;
}
// Drops one reference; always returns nullptr so callers can overwrite their pointer.
SimpleFilteredSentenceBreakData *decr() {
if(umtx_atomic_dec(&refcount) <= 0) {
delete this;
}
return nullptr;
}
virtual ~SimpleFilteredSentenceBreakData();
bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
// The getters dereference the pointers: check the matching has...() first.
const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
private:
// These tries own their data arrays.
// They are shared and must therefore not be modified.
LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
u_atomic_int32_t refcount;
};
// Empty body: the LocalPointer members release the two tries.
SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
/**
 * Concrete implementation
 *
 * Wraps a delegate BreakIterator and filters the break positions it reports:
 * positions for which breakExceptionAt() finds an exception are skipped.
 * Text access and current() are forwarded to the delegate unchanged.
 */
class SimpleFilteredSentenceBreakIterator : public BreakIterator {
public:
// Adopts the delegate and both tries (see the out-of-line definition).
SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
// Shares fData with other (reference counted) and clones other's delegate.
SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
virtual ~SimpleFilteredSentenceBreakIterator();
private:
SimpleFilteredSentenceBreakData *fData; // shared, reference-counted trie data
LocalPointer<BreakIterator> fDelegate; // the underlying iterator whose breaks are filtered
LocalUTextPointer fText; // refreshed from the delegate by resetState()
/* -- subclass interface -- */
public:
/* -- cloning and other subclass stuff -- */
// Ignores the caller's buffer: always heap-clones and sets
// status to U_SAFECLONE_ALLOCATED_WARNING.
virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
int32_t &/*BufferSize*/,
UErrorCode &status) override {
// for now - always deep clone
status = U_SAFECLONE_ALLOCATED_WARNING;
return clone();
}
virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); }
virtual UClassID getDynamicClassID() const override { return nullptr; }
// Identity comparison only: two distinct objects never compare equal.
virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; }
/* -- text modifying -- */
virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); }
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; }
virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); }
virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); }
/* -- other functions that are just delegated -- */
virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); }
virtual CharacterIterator& getText() const override { return fDelegate->getText(); }
/* -- ITERATION -- */
virtual int32_t first() override;
virtual int32_t preceding(int32_t offset) override;
virtual int32_t previous() override;
virtual UBool isBoundary(int32_t offset) override;
virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
virtual int32_t next() override;
virtual int32_t next(int32_t n) override;
virtual int32_t following(int32_t offset) override;
virtual int32_t last() override;
private:
/**
 * Given that the fDelegate has already given its "initial" answer,
 * find the NEXT actual (non-excepted) break.
 * @param n initial position from delegate
 * @return new break position or UBRK_DONE
 */
int32_t internalNext(int32_t n);
/**
 * Given that the fDelegate has already given its "initial" answer,
 * find the PREV actual (non-excepted) break.
 * @param n initial position from delegate
 * @return new break position or UBRK_DONE
 */
int32_t internalPrev(int32_t n);
/**
 * set up the UText with the value of the fDelegate.
 * Call this before calling breakExceptionAt.
 * May be able to avoid excess calls
 */
void resetState(UErrorCode &status);
/**
 * Is there a match (exception) at this spot?
 */
enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
/**
 * Determine if there is an exception at this spot
 * @param n spot to check
 * @return kNoExceptionHere or kExceptionHere
 **/
enum EFBMatchResult breakExceptionAt(int32_t n);
};
// Copy constructor: takes another reference on the shared trie data via incr()
// and clones the other iterator's delegate. fText is not copied; it starts
// empty and is filled by resetState() when needed.
// NOTE(review): other.fData and other.fDelegate are dereferenced without a null check.
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
: BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
{
}
/**
 * Adopting constructor.
 * Takes the valid and actual locales from the delegate, wraps both tries in a
 * freshly allocated shared-data object, and adopts the delegate.
 * If the shared-data allocation fails, both tries are deleted here and status
 * becomes U_MEMORY_ALLOCATION_ERROR unless it already holds a failure.
 *
 * @param adopt     delegate break iterator; adopted, dereferenced without a null check
 * @param forwards  forward (partial) trie, adopted; may be nullptr
 * @param backwards backward trie, adopted; may be nullptr
 * @param status    in/out error code
 */
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
    BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
    fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
    fDelegate(adopt)
{
    if (fData != nullptr) {
        return;  // the shared data object now owns both tries
    }
    // Allocation failed: nobody owns the tries, so release them here.
    delete forwards;
    delete backwards;
    if (!U_FAILURE(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
/**
 * Releases this iterator's reference on the shared trie data.
 * fData can be nullptr when the adopting constructor failed to allocate the
 * shared data (it then reports U_MEMORY_ALLOCATION_ERROR), so guard the call:
 * destroying such a half-constructed object must not dereference null.
 */
SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
    if (fData != nullptr) {
        fData = fData->decr();  // decr() always returns nullptr
    }
}
/**
 * Refreshes fText from the delegate.
 * The UText currently held (if any) is handed to the delegate's getUText() as
 * the fill-in object, and whatever getUText() returns becomes the new fText.
 */
void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
    UText *recycled = fText.orphan();
    UText *refreshed = fDelegate->getUText(recycled, status);
    fText.adoptInstead(refreshed);
}
// Decides whether the delegate's break at native index n should be suppressed.
//
// Steps:
//  1. Position fText at n and look at the preceding code point: if it is a
//     space it stays consumed, otherwise the iterator steps forward again.
//  2. Walk backwards, feeding each code point into a private copy of the
//     backwards trie, remembering the text position and trie value of the
//     last point at which the trie had a value.
//  3. A remembered value of kMATCH is an exception. A remembered value of
//     kPARTIAL (when a forward trie exists) restarts at the remembered
//     position and walks forwards through a copy of the forward trie; it is
//     an exception only if that walk ends on a match.
//
// fText must have been set up by resetState() before this is called.
SimpleFilteredSentenceBreakIterator::EFBMatchResult
SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
int64_t bestPosn = -1;
int32_t bestValue = -1;
// loops while 'n' points to an exception.
utext_setNativeIndex(fText.getAlias(), n); // from n..
//if(debug2) u_printf(" n@ %d\n", n);
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here??
// TODO only do this the 1st time?
//if(debug2) u_printf("skipping prev: |%C| \n", (char16_t)uch);
} else {
//if(debug2) u_printf("not skipping prev: |%C| \n", (char16_t)uch);
// Not a space: undo the step back so the backward walk starts at n.
utext_next32(fText.getAlias());
//if(debug2) u_printf(" -> : |%C| \n", (char16_t)uch);
}
{
// Do not modify the shared trie!
UCharsTrie iter(fData->getBackwardsTrie());
UChar32 uch;
while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards
UStringTrieResult r = iter.nextForCodePoint(uch);
if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
bestPosn = utext_getNativeIndex(fText.getAlias());
bestValue = iter.getValue();
}
if(!USTRINGTRIE_HAS_NEXT(r)) {
break;
}
//if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (char16_t)uch, r, utext_getNativeIndex(fText.getAlias()));
}
}
//if(bestValue >= 0) {
//if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
//}
// bestPosn stays -1 when the backward walk never reached a trie value.
if(bestPosn>=0) {
//if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue);
//if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
//int32_t bestValue = iter.getValue();
////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (char16_t)uch, r, bestValue);
if(bestValue == kMATCH) { // exact match!
//if(debug2) u_printf(" exact backward match\n");
return kExceptionHere; // See if the next is another exception.
} else if(bestValue == kPARTIAL
&& fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
//if(debug2) u_printf(" partial backward match\n");
// We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
// to see if it matches something going forward.
UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
//if(debug2) u_printf("Retrying at %d\n", bestPosn);
// Do not modify the shared trie!
UCharsTrie iter(fData->getForwardsPartialTrie());
UChar32 uch;
// The loop body is empty: all work happens in the condition, which stops
// at end of text or as soon as the trie cannot continue.
while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
//if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (char16_t)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
}
if(USTRINGTRIE_MATCHES(rfwd)) {
//if(debug2) u_printf("fwd> /%C/ == forward match!\n", (char16_t)uch);
// only full matches here, nothing to check
// skip the next:
return kExceptionHere;
} else {
//if(debug2) u_printf("fwd> /%C/ no match.\n", (char16_t)uch);
// no match (no exception) -return the 'underlying' break
return kNoExceptionHere;
}
} else {
return kNoExceptionHere; // internal error and/or no forwards trie
}
} else {
//if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (char16_t)uch, r); // no best match
return kNoExceptionHere; // No match - so exit. Not an exception.
}
}
// Forward filter: starting from the delegate's answer n, keep asking the
// delegate for its next break while the current one is an exception.
// Returns n unchanged when it is UBRK_DONE or when there is no backwards trie,
// and UBRK_DONE when the text cannot be refreshed. A break at the very end of
// the text is never tested for exceptions.
int32_t
SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
    if(n == UBRK_DONE) {
        return n;  // the delegate is already at the end
    }
    if(!fData->hasBackwardsTrie()) {
        return n;  // no backwards table loaded == no exceptions
    }

    UErrorCode status = U_ZERO_ERROR;
    resetState(status);  // refresh text
    if(U_FAILURE(status)) {
        return UBRK_DONE;  // bail out
    }

    const int64_t textEnd = utext_nativeLength(fText.getAlias());
    // One iteration per underlying break that turns out to be an exception.
    while(n != UBRK_DONE && n != textEnd && breakExceptionAt(n) == kExceptionHere) {
        n = fDelegate->next();  // skip this one, take the next lower-level break
    }
    return n;
}
// Backward filter: starting from the delegate's answer n, keep asking the
// delegate for its previous break while the current one is an exception.
// Returns n unchanged when it is 0 or UBRK_DONE or when there is no backwards
// trie, and UBRK_DONE when the text cannot be refreshed. A break at index 0 is
// never tested for exceptions.
int32_t
SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
    if(n == 0 || n == UBRK_DONE) {
        return n;  // start of text, or nothing more from the delegate
    }
    if(!fData->hasBackwardsTrie()) {
        return n;  // no backwards table loaded == no exceptions
    }

    UErrorCode status = U_ZERO_ERROR;
    resetState(status);  // refresh text
    if(U_FAILURE(status)) {
        return UBRK_DONE;  // bail out
    }

    // One iteration per underlying break that turns out to be an exception.
    while(n != UBRK_DONE && n != 0 && breakExceptionAt(n) == kExceptionHere) {
        n = fDelegate->previous();  // skip this one, take the preceding lower-level break
    }
    return n;
}
// Advances the delegate by one break and filters the result through internalNext().
int32_t
SimpleFilteredSentenceBreakIterator::next() {
    const int32_t delegateBreak = fDelegate->next();
    return internalNext(delegateBreak);
}
// The break opportunity at the beginning of the text is never suppressed:
// the delegate's answer is returned unfiltered.
int32_t
SimpleFilteredSentenceBreakIterator::first() {
    const int32_t start = fDelegate->first();
    return start;
}
// Asks the delegate for the break preceding offset and filters it through internalPrev().
int32_t
SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
    const int32_t delegateBreak = fDelegate->preceding(offset);
    return internalPrev(delegateBreak);
}
// Moves the delegate back by one break and filters the result through internalPrev().
int32_t
SimpleFilteredSentenceBreakIterator::previous() {
    const int32_t delegateBreak = fDelegate->previous();
    return internalPrev(delegateBreak);
}
/**
 * Tells whether offset is a boundary after filtering.
 *
 * @param offset text offset to test
 * @return false if the delegate reports no boundary at offset, if the boundary
 *         is suppressed by an exception, or if the text could not be
 *         refreshed; true otherwise
 */
UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
    if (!fDelegate->isBoundary(offset)) {
        return false;  // no break to suppress
    }
    if (!fData->hasBackwardsTrie()) {
        return true;  // no data = no suppressions
    }

    UErrorCode status = U_ZERO_ERROR;
    resetState(status);
    if (U_FAILURE(status)) {
        // Bail out, as internalNext()/internalPrev() do: breakExceptionAt()
        // must not run on text that could not be refreshed.
        return false;
    }

    return breakExceptionAt(offset) != kExceptionHere;
}
// Forwards the argument to the delegate's next(int32_t) and filters the
// position it returns through internalNext().
int32_t
SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
    const int32_t delegateBreak = fDelegate->next(offset);
    return internalNext(delegateBreak);
}
// Asks the delegate for the break following offset and filters it through internalNext().
int32_t
SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
    const int32_t delegateBreak = fDelegate->following(offset);
    return internalNext(delegateBreak);
}
// The break opportunity at the end of the text is never suppressed:
// the delegate's answer is returned unfiltered.
int32_t
SimpleFilteredSentenceBreakIterator::last() {
    const int32_t end = fDelegate->last();
    return end;
}
/**
 * Concrete implementation of builder class.
 * Collects exception strings in a UStringSet; build() turns the collected set
 * into the tries used by SimpleFilteredSentenceBreakIterator.
 */
class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
public:
virtual ~SimpleFilteredBreakIteratorBuilder();
// Starts with the exceptions loaded from the locale's break-iterator resource data.
SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
// Starts with an empty exception set.
SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override;
private:
UStringSet fSet; // the exception strings collected so far
};
// Empty body: the fSet member cleans up after itself.
SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
{
}
// Creates a builder with an empty exception set; status is passed on to the
// set's constructor.
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
: fSet(status)
{
}
// Creates a builder preloaded with the exceptions found in the break-iterator
// resource bundle for fromLocale, under "exceptions" / "SentenceBreak".
//
// Each of the three lookups (bundle, "exceptions", "SentenceBreak") ends
// construction early when it fails or yields U_USING_DEFAULT_WARNING; that
// code is copied into status and the set stays empty. Otherwise every
// resource under "SentenceBreak" is read as a string and added with
// suppressBreakAfter().
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
: fSet(status)
{
if(U_SUCCESS(status)) {
// Lookups use a separate status so that warnings can be inspected
// before anything is copied into the caller's status.
UErrorCode subStatus = U_ZERO_ERROR;
LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
status = subStatus; // copy the failing status
#if FB_DEBUG
fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
#endif
return; // leaves the builder empty, if you try to use it.
}
LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", nullptr, &subStatus));
if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
status = subStatus; // copy the failing status
#if FB_DEBUG
fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
#endif
return; // leaves the builder empty, if you try to use it.
}
LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", nullptr, &subStatus));
#if FB_DEBUG
{
UErrorCode subsub = subStatus;
fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
}
#endif
if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
status = subStatus; // copy the failing status
#if FB_DEBUG
fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
#endif
return; // leaves the builder empty, if you try to use it.
}
LocalUResourceBundlePointer strs;
subStatus = status; // Pick up inherited warning status now
// Iterate over the child resources; strs is recycled as the fill-in
// object for each ures_getNextResource() call.
do {
strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
if(strs.isValid() && U_SUCCESS(subStatus)) {
// Note: the string is fetched with the caller's status, not subStatus.
UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
suppressBreakAfter(str, status); // load the string
}
} while (strs.isValid() && U_SUCCESS(subStatus));
// U_INDEX_OUTOFBOUNDS_ERROR is not propagated; presumably it is how the
// iteration signals that there are no more items - confirm against ures.h.
if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
status = subStatus;
}
}
}
// Adds a copy of the exception string to the set.
// Returns what UStringSet::add() reports: true only when the string was
// newly inserted without error.
UBool
SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{
    const UBool added = fSet.add(exception, status);
    FB_TRACE("suppressBreakAfter",&exception,added,0);
    return added;
}
// Removes the exception string from the set.
// Returns what UStringSet::remove() reports: true only when the string was
// present and has been removed.
UBool
SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{
    const UBool removed = fSet.remove(exception, status);
    FB_TRACE("unsuppressBreakAfter",&exception,removed,0);
    return removed;
}
/**
 * Allocates an array of UnicodeString, never with zero elements.
 *
 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly, so a request
 * for zero elements is turned into a request for one.
 *
 * Note: "new UnicodeString[subCount]" ends up calling global operator new
 * on MSVC2012 for some reason.
 */
static inline UnicodeString* newUnicodeStringArray(size_t count) {
    const size_t allocCount = (count == 0) ? 1 : count;
    return new UnicodeString[allocCount];
}
/**
 * Builds a filtered break iterator from the collected exception strings.
 *
 * The exceptions are split over two tries:
 *  - strings with a '.' before their last character are "partials": the
 *    prefix up to and including the first '.' is stored reversed in the
 *    backwards trie with value kPARTIAL (once per distinct prefix), and the
 *    full string is stored in the forwards trie;
 *  - strings sharing such a prefix are also routed to the forwards trie;
 *  - all remaining strings are stored reversed in the backwards trie with
 *    value kMATCH.
 *
 * @param adoptBreakIterator iterator to wrap; adopted in every case (deleted
 *                           on failure). Must not be nullptr.
 * @param status in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for a null
 *               iterator and U_MEMORY_ALLOCATION_ERROR when a working buffer
 *               or the result cannot be allocated
 * @return the new iterator, or nullptr on failure
 */
BreakIterator *
SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
    // Owned from here on, so every early return releases it.
    LocalPointer<BreakIterator> adopt(adoptBreakIterator);

    LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);   // backwards trie
    LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);  // forwards trie
    if(U_FAILURE(status)) {
        return nullptr;
    }
    if(adopt.isNull()) {
        // The iterator constructor dereferences the delegate unconditionally.
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    int32_t revCount = 0;
    int32_t fwdCount = 0;

    int32_t subCount = fSet.size();

    LocalArray<UnicodeString> ustrs(newUnicodeStringArray(subCount));

    LocalMemory<int> partials;
    partials.allocateInsteadAndReset(subCount);

    // allocateInsteadAndReset() legitimately yields nullptr for a zero count,
    // so only treat a null partials buffer as an error when entries are expected.
    if(ustrs.isNull() || (subCount > 0 && partials.isNull())) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    LocalPointer<UCharsTrie> backwardsTrie;        // i.e. ".srM" for Mrs.
    LocalPointer<UCharsTrie> forwardsPartialTrie;  // Has ".a" for "a.M."

    // Copy the set's strings into a flat array so they can be reversed in place later.
    int n=0;
    for ( int32_t i = 0;
          i<fSet.size();
          i++) {
        const UnicodeString *abbr = fSet.getStringAt(i);
        if(abbr) {
            FB_TRACE("build",abbr,true,i);
            ustrs[n] = *abbr; // copy by value
            FB_TRACE("ustrs[n]",&ustrs[n],true,i);
        } else {
            FB_TRACE("build",abbr,false,i);
            status = U_MEMORY_ALLOCATION_ERROR;
            return nullptr;
        }
        partials[n] = 0; // default: not partial
        n++;
    }

    // first pass - find partials.
    for(int i=0;i<subCount;i++) {
        int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
        if(nn>-1 && (nn+1)!=ustrs[i].length()) {
            FB_TRACE("partial",&ustrs[i],false,i);
            // is partial.
            // is it unique?
            int sameAs = -1;
            for(int j=0;j<subCount;j++) {
                if(j==i) continue;
                if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
                    FB_TRACE("prefix",&ustrs[j],false,nn+1);
                    //UBool otherIsPartial = ((nn+1)!=ustrs[j].length());  // true if ustrs[j] doesn't end at nn
                    if(partials[j]==0) { // hasn't been processed yet
                        partials[j] = kSuppressInReverse | kAddToForward;
                        FB_TRACE("suppressing",&ustrs[j],false,j);
                    } else if(partials[j] & kSuppressInReverse) {
                        sameAs = j; // the other entry is already in the reverse table.
                    }
                }
            }
            FB_TRACE("for partial same-",&ustrs[i],false,sameAs);
            FB_TRACE(" == partial #",&ustrs[i],false,partials[i]);
            UnicodeString prefix(ustrs[i], 0, nn+1);
            if(sameAs == -1 && partials[i] == 0) {
                // first one - add the prefix to the reverse table.
                prefix.reverse();
                builder->add(prefix, kPARTIAL, status);
                revCount++;
                FB_TRACE("Added partial",&prefix,false, i);
                FB_TRACE(u_errorName(status),&ustrs[i],false,i);
                partials[i] = kSuppressInReverse | kAddToForward;
            } else {
                FB_TRACE("NOT adding partial",&prefix,false, i);
                FB_TRACE(u_errorName(status),&ustrs[i],false,i);
            }
        }
    }

    // second pass - distribute every string to one of the two tries.
    for(int i=0;i<subCount;i++) {
        if(partials[i]==0) {
            ustrs[i].reverse();
            builder->add(ustrs[i], kMATCH, status);
            revCount++;
            FB_TRACE(u_errorName(status), &ustrs[i], false, i);
        } else {
            FB_TRACE("Adding fwd",&ustrs[i], false, i);

            // an optimization would be to only add the portion after the '.'
            // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
            // instead of "Ph.D." since we already know the "Ph." part is a match.
            // would need the trie to be able to hold 0-length strings, though.
            builder2->add(ustrs[i], kMATCH, status); // forward
            fwdCount++;
            //ustrs[i].reverse();
            ////if(debug2) u_printf("SUPPRESS- not Added(%d):  /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
        }
    }
    FB_TRACE("AbbrCount",nullptr,false, subCount);

    // A trie is only built when something was added to its builder.
    if(revCount>0) {
        backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
        if(U_FAILURE(status)) {
            FB_TRACE(u_errorName(status),nullptr,false, -1);
            return nullptr;
        }
    }

    if(fwdCount>0) {
        forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
        if(U_FAILURE(status)) {
            FB_TRACE(u_errorName(status),nullptr,false, -1);
            return nullptr;
        }
    }

    BreakIterator *result = new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
    if(result == nullptr && U_SUCCESS(status)) {
        // Do not hand back nullptr together with a success code.
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return result;
}
// ----------- Base class implementation
// Default constructor; the body is empty.
FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
}
// Destructor; the body is empty.
FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
}
// Creates a builder preloaded from the resource data for the given locale.
// Returns nullptr when status is already a failure on entry or becomes one
// during construction; warnings do not prevent the builder from being returned.
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
    if(U_FAILURE(status)) {
        return nullptr;
    }
    LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
    if(U_FAILURE(status)) {
        return nullptr;  // ret deletes the half-initialized builder, if any
    }
    return ret.orphan();
}
// Same as createEmptyInstance(): simply forwards to it.
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
    FilteredBreakIteratorBuilder *emptyBuilder = createEmptyInstance(status);
    return emptyBuilder;
}
// Creates a builder with no exceptions loaded.
// Returns nullptr when status is already a failure on entry or becomes one
// during construction.
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
    if(U_FAILURE(status)) {
        return nullptr;
    }
    LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
    if(U_FAILURE(status)) {
        return nullptr;  // ret deletes the builder, if one was allocated
    }
    return ret.orphan();
}
U_NAMESPACE_END
#endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION

View file

@ -0,0 +1,363 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: filterednormalizer2.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009dec10
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringoptions.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cpputils.h"
U_NAMESPACE_BEGIN
// Out-of-line destructor with an empty body.
FilteredNormalizer2::~FilteredNormalizer2() {}
// Public entry point: replaces dest with the filtered normalization of src.
// - If src fails the buffer check (or errorCode is already a failure), dest is
//   set to bogus.
// - If dest and src are the same object, errorCode becomes
//   U_ILLEGAL_ARGUMENT_ERROR and dest is left untouched.
// - Otherwise dest is emptied and filled by the span-wise internal overload.
// Returns dest in every case.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(src, errorCode);
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
    } else if(&dest==&src) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        dest.remove();
        normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
    }
    return dest;
}
// Internal worker: performs no argument checking and APPENDS to dest.
// src is processed as alternating spans of in-filter and out-of-filter text:
// in-filter spans are normalized with norm2 and appended, out-of-filter spans
// are appended unchanged.
// The caller chooses the condition for the first span; it should be the one
// likely to give a non-empty span at the start of src. For set=[:age=3.2:],
// since almost all common characters were in Unicode 3.2, that is
// USET_SPAN_SIMPLE at the start of src and USET_SPAN_NOT_CONTAINED when
// continuing after an in-filter prefix.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               USetSpanCondition spanCondition,
                               UErrorCode &errorCode) const {
    // Scratch output, declared outside the loop so its buffer is reused.
    UnicodeString tempDest;
    int32_t spanStart=0;
    while(spanStart<src.length()) {
        const int32_t spanEnd=set.span(src, spanStart, spanCondition);
        const int32_t spanLength=spanEnd-spanStart;
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter span.
            if(spanLength!=0) {
                // Deliberately not norm2.normalizeSecondAndAppend(): the text
                // already in dest that is outside the filter must stay as is.
                dest.append(norm2.normalize(src.tempSubStringBetween(spanStart, spanEnd),
                                            tempDest, errorCode));
                if(U_FAILURE(errorCode)) {
                    return dest;
                }
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            // Out-of-filter span: copy verbatim.
            if(spanLength!=0) {
                dest.append(src, spanStart, spanLength);
            }
            spanCondition=USET_SPAN_SIMPLE;
        }
        spanStart=spanEnd;
    }
    return dest;
}
// Public UTF-8 entry point.
// Resets edits once up front (unless the caller asked for no reset or passed
// no Edits object) and then runs the span-wise worker with U_EDITS_NO_RESET
// added, so the per-span calls do not reset it again.
void
FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
                                   Edits *edits, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return;
    }
    const bool callerWantsReset = (options & U_EDITS_NO_RESET) == 0;
    if (edits != nullptr && callerWantsReset) {
        edits->reset();
    }
    const uint32_t spanOptions = options | U_EDITS_NO_RESET;
    normalizeUTF8(spanOptions, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
}
// Internal UTF-8 worker: walks src as alternating in-filter and out-of-filter
// spans, starting with the given span condition.
// In-filter spans are normalized by norm2 into sink. Out-of-filter spans are
// recorded as unchanged in edits (if given) and copied to sink unless
// U_OMIT_UNCHANGED_TEXT is set. Stops at the first failure.
void
FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
                                   ByteSink &sink, Edits *edits,
                                   USetSpanCondition spanCondition,
                                   UErrorCode &errorCode) const {
    while (length > 0) {
        const int32_t spanLength = set.spanUTF8(src, length, spanCondition);
        if (spanCondition != USET_SPAN_NOT_CONTAINED) {
            // In-filter span.
            if (spanLength != 0) {
                // Deliberately not norm2.normalizeSecondAndAppend(): output that
                // is outside the filter must not be modified.
                norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
                if (U_FAILURE(errorCode)) {
                    return;
                }
            }
            spanCondition = USET_SPAN_NOT_CONTAINED;
        } else {
            // Out-of-filter span: pass through.
            if (spanLength != 0) {
                if (edits != nullptr) {
                    edits->addUnchanged(spanLength);
                }
                const bool copyUnchanged = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                if (copyUnchanged) {
                    sink.Append(src, spanLength);
                }
            }
            spanCondition = USET_SPAN_SIMPLE;
        }
        src += spanLength;
        length -= spanLength;
    }
}
// Appends the normalized form of second to first: forwards to the shared
// worker with doNormalize set to true.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UErrorCode &errorCode) const {
    const UBool doNormalize = true;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
// Appends second to first without normalizing second: forwards to the shared
// worker with doNormalize set to false.
UnicodeString &
FilteredNormalizer2::append(UnicodeString &first,
                            const UnicodeString &second,
                            UErrorCode &errorCode) const {
    const UBool doNormalize = false;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
// Shared worker behind normalizeSecondAndAppend() and append().
//
// After argument checks (both strings must pass the buffer check and must be
// distinct objects), an empty first is simply replaced by second, normalized
// or not according to doNormalize. Otherwise:
//  1. the in-filter suffix of first and the in-filter prefix of second are
//     merged by norm2 (normalizeSecondAndAppend or append);
//  2. whatever remains of second is appended, through the span-wise
//     normalize() worker when doNormalize is set, verbatim otherwise.
// Returns first.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UBool doNormalize,
                                              UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(first, errorCode);
    uprv_checkCanGetBuffer(second, errorCode);
    if(U_FAILURE(errorCode)) {
        return first;
    }
    if(&first==&second) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return first;
    }
    if(first.isEmpty()) {
        if(!doNormalize) {
            return first=second;
        }
        return normalize(second, first, errorCode);
    }

    // merge the in-filter suffix of the first string with the in-filter prefix of the second
    const int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    if(prefixLimit!=0) {
        UnicodeString prefix(second.tempSubString(0, prefixLimit));
        const int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
        if(suffixStart!=0) {
            // Only the tail of first is in the filter: merge on a copy of that
            // tail and write it back, leaving the head untouched.
            UnicodeString middle(first, suffixStart, INT32_MAX);
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
            } else {
                norm2.append(middle, prefix, errorCode);
            }
            first.replace(suffixStart, INT32_MAX, middle);
        } else {
            // All of first is in the filter: merge in place.
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(first, prefix, errorCode);
            } else {
                norm2.append(first, prefix, errorCode);
            }
        }
    }

    // Append what follows the in-filter prefix of second, if anything.
    if(prefixLimit<second.length()) {
        UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
        if(doNormalize) {
            normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
        } else {
            first.append(rest);
        }
    }
    return first;
}
// Code points outside the filter set have no decomposition; for the others
// the question is passed on to norm2.
UBool
FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    if(!set.contains(c)) {
        return false;
    }
    return norm2.getDecomposition(c, decomposition) != 0;
}
// Code points outside the filter set have no raw decomposition; for the
// others the question is passed on to norm2.
UBool
FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    if(!set.contains(c)) {
        return false;
    }
    return norm2.getRawDecomposition(c, decomposition) != 0;
}
// Composes a and b via norm2 only when both are in the filter set;
// otherwise returns U_SENTINEL.
UChar32
FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    if(!set.contains(a) || !set.contains(b)) {
        return U_SENTINEL;
    }
    return norm2.composePair(a, b);
}
// Returns norm2's combining class for code points in the filter set and 0
// for all others.
uint8_t
FilteredNormalizer2::getCombiningClass(UChar32 c) const {
    if(!set.contains(c)) {
        return 0;
    }
    return norm2.getCombiningClass(c);
}
// Tests whether every in-filter span of s is normalized according to norm2.
// Out-of-filter spans are skipped. Returns false on the first span that is
// not normalized, or on any failure.
UBool
FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return false;
    }
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t spanStart=0;
    while(spanStart<s.length()) {
        const int32_t spanEnd=set.span(s, spanStart, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter span: this is the part norm2 has a say about.
            const UBool spanOk=
                norm2.isNormalized(s.tempSubStringBetween(spanStart, spanEnd), errorCode);
            if(!spanOk || U_FAILURE(errorCode)) {
                return false;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            spanCondition=USET_SPAN_SIMPLE;
        }
        spanStart=spanEnd;
    }
    return true;
}
// UTF-8 counterpart of isNormalized(): tests whether every in-filter span of
// sp is normalized according to norm2, skipping out-of-filter spans.
// Returns false on the first span that is not normalized, or on any failure.
UBool
FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return false;
    }
    const char *cursor = sp.data();
    int32_t remaining = sp.length();
    USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
    while (remaining > 0) {
        const int32_t spanLength = set.spanUTF8(cursor, remaining, spanCondition);
        if (spanCondition != USET_SPAN_NOT_CONTAINED) {
            // In-filter span: this is the part norm2 has a say about.
            const UBool spanOk = norm2.isNormalizedUTF8(StringPiece(cursor, spanLength), errorCode);
            if (!spanOk || U_FAILURE(errorCode)) {
                return false;
            }
            spanCondition = USET_SPAN_NOT_CONTAINED;
        } else {
            spanCondition = USET_SPAN_SIMPLE;
        }
        cursor += spanLength;
        remaining -= spanLength;
    }
    return true;
}
// Quick check over the in-filter runs of s. Returns immediately with the
// wrapped normalizer's result on an error or on UNORM_NO; remembers
// UNORM_MAYBE from any run; otherwise returns UNORM_YES.
// If s cannot provide a buffer, returns UNORM_MAYBE with errorCode set.
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    UNormalizationCheckResult overall=UNORM_YES;
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t runStart=0;
    while(runStart<s.length()) {
        const int32_t runLimit=set.span(s, runStart, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter run: delegate to the wrapped normalizer.
            const UNormalizationCheckResult runResult=
                norm2.quickCheck(s.tempSubStringBetween(runStart, runLimit), errorCode);
            if(U_FAILURE(errorCode) || runResult==UNORM_NO) {
                return runResult;
            }
            if(runResult==UNORM_MAYBE) {
                overall=runResult;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        runStart=runLimit;
    }
    return overall;
}
// Returns the end index of the prefix of s for which every in-filter run
// passes the wrapped normalizer's spanQuickCheckYes() completely.
// Out-of-filter runs always count as "yes". Returns 0 if s cannot provide a
// buffer; returns the partial limit as soon as a run fails or an error occurs.
int32_t
FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t runStart=0;
    while(runStart<s.length()) {
        const int32_t runLimit=set.span(s, runStart, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter run: the wrapped normalizer reports a length relative
            // to the run, so add the run's start to get an index into s.
            const int32_t yesLength=
                norm2.spanQuickCheckYes(s.tempSubStringBetween(runStart, runLimit), errorCode);
            const int32_t yesLimit=runStart+yesLength;
            if(U_FAILURE(errorCode) || yesLimit<runLimit) {
                return yesLimit;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        runStart=runLimit;
    }
    return s.length();
}
// A code point outside the filter set always has a boundary before it;
// otherwise the wrapped normalizer decides.
UBool
FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    const bool inFilter = set.contains(c);
    return !inFilter || norm2.hasBoundaryBefore(c);
}
// A code point outside the filter set always has a boundary after it;
// otherwise the wrapped normalizer decides.
UBool
FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    const bool inFilter = set.contains(c);
    return !inFilter || norm2.hasBoundaryAfter(c);
}
// A code point outside the filter set is always inert; otherwise the wrapped
// normalizer decides.
UBool
FilteredNormalizer2::isInert(UChar32 c) const {
    const bool inFilter = set.contains(c);
    return !inFilter || norm2.isInert(c);
}
U_NAMESPACE_END
// C API ------------------------------------------------------------------- ***
U_NAMESPACE_USE
/**
 * C API: creates a FilteredNormalizer2 that wraps norm2 and restricts it to
 * the code points in filterSet.
 *
 * @param norm2      the normalizer to wrap; must not be nullptr
 * @param filterSet  the set of code points to normalize; must not be nullptr
 * @param pErrorCode in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if
 *                   either pointer argument is nullptr, or to
 *                   U_MEMORY_ALLOCATION_ERROR if the allocation fails.
 * @return the new filtered normalizer, or nullptr on failure
 */
U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    /* norm2 is dereferenced into a reference below, so a null pointer must be
       rejected here just like a null filter set. */
    if(norm2==nullptr || filterSet==nullptr) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
                                             *UnicodeSet::fromUSet(filterSet));
    if(fn2==nullptr) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return (UNormalizer2 *)fn2;
}
#endif // !UCONFIG_NO_NORMALIZATION

267
engine/thirdparty/icu4c/common/hash.h vendored Normal file
View file

@ -0,0 +1,267 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 1997-2014, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* Date Name Description
* 03/28/00 aliu Creation.
******************************************************************************
*/
#ifndef HASH_H
#define HASH_H
#include "unicode/unistr.h"
#include "unicode/uobject.h"
#include "cmemory.h"
#include "uhash.h"
U_NAMESPACE_BEGIN
/**
* Hashtable is a thin C++ wrapper around UHashtable, a general-purpose void*
* hashtable implemented in C. Hashtable is designed to be idiomatic and
* easy-to-use in C++.
*
* Hashtable is an INTERNAL CLASS.
*/
class U_COMMON_API Hashtable : public UMemory {
// Points at hashObj once initialization succeeded; stays nullptr otherwise.
UHashtable* hash;
// Storage for the underlying C hashtable (initialized in place by init()/initSize()).
UHashtable hashObj;
// Initialize hashObj with the given functions; on success sets hash and installs the key deleter.
inline void init(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status);
// Same as init(), but passes an initial size to uhash_initSize().
inline void initSize(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t size, UErrorCode& status);
public:
/**
* Construct a hashtable
* @param ignoreKeyCase If true, keys are case insensitive.
* @param status Error code
*/
inline Hashtable(UBool ignoreKeyCase, UErrorCode& status);
/**
* Construct a hashtable
* @param ignoreKeyCase If true, keys are case insensitive.
* @param size initial size allocation
* @param status Error code
*/
inline Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status);
/**
* Construct a hashtable
* @param keyComp Comparator for comparing the keys
* @param valueComp Comparator for comparing the values
* @param status Error code
*/
inline Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status);
/**
* Construct a hashtable
* @param status Error code
*/
inline Hashtable(UErrorCode& status);
/**
* Construct a hashtable, _disregarding any error_. Use this constructor
* with caution.
*/
inline Hashtable();
/**
* Non-virtual destructor; make this virtual if Hashtable is subclassed
* in the future.
*/
inline ~Hashtable();
// Forwards to uhash_setValueDeleter().
inline UObjectDeleter *setValueDeleter(UObjectDeleter *fn);
// Forwards to uhash_count().
inline int32_t count() const;
// The put*() methods store a heap-allocated copy of key (new UnicodeString(key)).
inline void* put(const UnicodeString& key, void* value, UErrorCode& status);
inline int32_t puti(const UnicodeString& key, int32_t value, UErrorCode& status);
inline int32_t putiAllowZero(const UnicodeString& key, int32_t value, UErrorCode& status);
// Lookups pass the address of key straight to the matching uhash_get*() function.
inline void* get(const UnicodeString& key) const;
inline int32_t geti(const UnicodeString& key) const;
inline int32_t getiAndFound(const UnicodeString& key, UBool &found) const;
// Forward to uhash_remove() / uhash_removei() / uhash_removeAll().
inline void* remove(const UnicodeString& key);
inline int32_t removei(const UnicodeString& key);
inline void removeAll();
// Forward to uhash_containsKey() / uhash_find().
inline UBool containsKey(const UnicodeString& key) const;
inline const UHashElement* find(const UnicodeString& key) const;
/**
* @param pos - must be UHASH_FIRST on first call, and untouched afterwards.
* @see uhash_nextElement
*/
inline const UHashElement* nextElement(int32_t& pos) const;
// Forward to uhash_setKeyComparator() / uhash_setValueComparator().
inline UKeyComparator* setKeyComparator(UKeyComparator*keyComp);
inline UValueComparator* setValueComparator(UValueComparator* valueComp);
// Forwards to uhash_equals() on the two underlying tables.
inline UBool equals(const Hashtable& that) const;
private:
Hashtable(const Hashtable &other) = delete; // forbid copying of this class
Hashtable &operator=(const Hashtable &other) = delete; // forbid copying of this class
};
/*********************************************************************
* Implementation
********************************************************************/
// Initializes the embedded UHashtable. Does nothing if status already
// indicates failure. On success, hash points at hashObj and keys are deleted
// with uprv_deleteUObject; on failure, hash is left unchanged.
inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
                            UValueComparator *valueComp, UErrorCode& status) {
    if (U_SUCCESS(status)) {
        uhash_init(&hashObj, keyHash, keyComp, valueComp, &status);
        if (U_FAILURE(status)) {
            return;
        }
        hash = &hashObj;
        uhash_setKeyDeleter(hash, uprv_deleteUObject);
    }
}
// Like init(), but initializes the embedded UHashtable through
// uhash_initSize() with the requested size. Does nothing if status already
// indicates failure; hash is only set when initialization succeeds.
inline void Hashtable::initSize(UHashFunction *keyHash, UKeyComparator *keyComp,
                                UValueComparator *valueComp, int32_t size, UErrorCode& status) {
    if (U_SUCCESS(status)) {
        uhash_initSize(&hashObj, keyHash, keyComp, valueComp, size, &status);
        if (U_FAILURE(status)) {
            return;
        }
        hash = &hashObj;
        uhash_setKeyDeleter(hash, uprv_deleteUObject);
    }
}
// Constructs a table keyed by UnicodeString hash values, with caller-supplied
// key and value comparators.
inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp,
                            UErrorCode& status)
    : hash(nullptr)
{
    UHashFunction *keyHash = uhash_hashUnicodeString;
    init(keyHash, keyComp, valueComp, status);
}
// Constructs a table with UnicodeString keys; ignoreKeyCase selects the
// caseless hash/compare function pair. No value comparator is installed.
inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status)
    : hash(nullptr)
{
    UHashFunction *keyHash = uhash_hashUnicodeString;
    UKeyComparator *keyComp = uhash_compareUnicodeString;
    if (ignoreKeyCase) {
        keyHash = uhash_hashCaselessUnicodeString;
        keyComp = uhash_compareCaselessUnicodeString;
    }
    init(keyHash, keyComp, nullptr, status);
}
// Constructs a table with UnicodeString keys and an initial size;
// ignoreKeyCase selects the caseless hash/compare function pair.
// No value comparator is installed.
inline Hashtable::Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status)
    : hash(nullptr)
{
    UHashFunction *keyHash = uhash_hashUnicodeString;
    UKeyComparator *keyComp = uhash_compareUnicodeString;
    if (ignoreKeyCase) {
        keyHash = uhash_hashCaselessUnicodeString;
        keyComp = uhash_compareCaselessUnicodeString;
    }
    initSize(keyHash, keyComp, nullptr, size, status);
}
// Constructs a case-sensitive table with UnicodeString keys and no value
// comparator.
inline Hashtable::Hashtable(UErrorCode& status)
    : hash(nullptr)
{
    UHashFunction *keyHash = uhash_hashUnicodeString;
    UKeyComparator *keyComp = uhash_compareUnicodeString;
    init(keyHash, keyComp, nullptr, status);
}
// Constructs a case-sensitive table with UnicodeString keys. Any
// initialization error is recorded only in a local status and then dropped,
// so on failure hash simply stays nullptr.
inline Hashtable::Hashtable()
    : hash(nullptr)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;
    UHashFunction *keyHash = uhash_hashUnicodeString;
    UKeyComparator *keyComp = uhash_compareUnicodeString;
    init(keyHash, keyComp, nullptr, ignoredStatus);
}
// Closes the underlying table, but only if initialization succeeded.
inline Hashtable::~Hashtable() {
    if (hash == nullptr) {
        return;
    }
    uhash_close(hash);
}
// Installs fn as the value deleter and returns what uhash_setValueDeleter() returns.
inline UObjectDeleter *Hashtable::setValueDeleter(UObjectDeleter *fn) {
    UObjectDeleter *result = uhash_setValueDeleter(hash, fn);
    return result;
}
// Returns the element count reported by uhash_count().
inline int32_t Hashtable::count() const {
    const int32_t elementCount = uhash_count(hash);
    return elementCount;
}
inline void* Hashtable::put(const UnicodeString& key, void* value, UErrorCode& status) {
return uhash_put(hash, new UnicodeString(key), value, &status);
}
// Stores an integer value under a heap-allocated copy of key and returns the
// result of uhash_puti().
inline int32_t Hashtable::puti(const UnicodeString& key, int32_t value, UErrorCode& status) {
    UnicodeString *keyCopy = new UnicodeString(key);
    return uhash_puti(hash, keyCopy, value, &status);
}
// Stores an integer value under a heap-allocated copy of key via
// uhash_putiAllowZero() and returns its result.
inline int32_t Hashtable::putiAllowZero(const UnicodeString& key, int32_t value,
                                        UErrorCode& status) {
    UnicodeString *keyCopy = new UnicodeString(key);
    return uhash_putiAllowZero(hash, keyCopy, value, &status);
}
inline void* Hashtable::get(const UnicodeString& key) const {
return uhash_get(hash, &key);
}
// Looks up key via uhash_geti() and returns its result.
inline int32_t Hashtable::geti(const UnicodeString& key) const {
    const UnicodeString *lookupKey = &key;
    return uhash_geti(hash, lookupKey);
}
// Looks up key via uhash_getiAndFound(); found is passed through as the
// out-parameter of that call.
inline int32_t Hashtable::getiAndFound(const UnicodeString& key, UBool &found) const {
    const UnicodeString *lookupKey = &key;
    return uhash_getiAndFound(hash, lookupKey, &found);
}
inline void* Hashtable::remove(const UnicodeString& key) {
return uhash_remove(hash, &key);
}
// Removes key via uhash_removei() and returns its result.
inline int32_t Hashtable::removei(const UnicodeString& key) {
    const UnicodeString *lookupKey = &key;
    return uhash_removei(hash, lookupKey);
}
// Returns the result of uhash_containsKey() for key.
inline UBool Hashtable::containsKey(const UnicodeString& key) const {
    const UnicodeString *lookupKey = &key;
    return uhash_containsKey(hash, lookupKey);
}
// Returns the element uhash_find() reports for key.
inline const UHashElement* Hashtable::find(const UnicodeString& key) const {
    const UnicodeString *lookupKey = &key;
    return uhash_find(hash, lookupKey);
}
// Iteration step: forwards to uhash_nextElement(), which updates pos.
inline const UHashElement* Hashtable::nextElement(int32_t& pos) const {
    const UHashElement *element = uhash_nextElement(hash, &pos);
    return element;
}
inline void Hashtable::removeAll() {
uhash_removeAll(hash);
}
// Installs keyComp and returns what uhash_setKeyComparator() returns.
inline UKeyComparator* Hashtable::setKeyComparator(UKeyComparator*keyComp){
    UKeyComparator *result = uhash_setKeyComparator(hash, keyComp);
    return result;
}
// Installs valueComp and returns what uhash_setValueComparator() returns.
inline UValueComparator* Hashtable::setValueComparator(UValueComparator* valueComp){
    UValueComparator *result = uhash_setValueComparator(hash, valueComp);
    return result;
}
// Compares the two underlying tables with uhash_equals().
inline UBool Hashtable::equals(const Hashtable& that)const{
    const UHashtable *other = that.hash;
    return uhash_equals(hash, other);
}
U_NAMESPACE_END
#endif

View file

@ -0,0 +1,31 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2009-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/icudataver.h"
#include "unicode/ures.h"
#include "uresimp.h" /* for ures_getVersionByKey */
/*
 * Fills dataVersionFillin with the version stored under U_ICU_DATA_KEY in the
 * U_ICU_VERSION_BUNDLE resource bundle. Does nothing if *status already
 * indicates failure or if dataVersionFillin is nullptr.
 */
U_CAPI void U_EXPORT2 u_getDataVersion(UVersionInfo dataVersionFillin, UErrorCode *status) {
    if (U_FAILURE(*status) || dataVersionFillin == nullptr) {
        return;
    }
    UResourceBundle *versionBundle = ures_openDirect(nullptr, U_ICU_VERSION_BUNDLE , status);
    if (!U_FAILURE(*status)) {
        ures_getVersionByKey(versionBundle, U_ICU_DATA_KEY, dataVersionFillin, status);
    }
    /* Closed unconditionally, exactly as the bundle was opened unconditionally. */
    ures_close(versionBundle);
}

View file

@ -0,0 +1,884 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2009-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* FILE NAME : icuplug.c
*
* Date Name Description
* 10/29/2009 sl New.
******************************************************************************
*/
#include "unicode/icuplug.h"
#if UCONFIG_ENABLE_PLUGINS
#include "icuplugimp.h"
#include "cstring.h"
#include "cmemory.h"
#include "putilimp.h"
#include "ucln.h"
#include <stdio.h>
#ifdef __MVS__ /* defined by z/OS compiler */
#define _POSIX_SOURCE
#include <cics.h> /* 12 Nov 2011 JAM iscics() function */
#endif
#include "charstr.h"
using namespace icu;
/* Set UPLUG_TRACE to 1 to get plugin tracing on stderr. */
#ifndef UPLUG_TRACE
#define UPLUG_TRACE 0
#endif
#if UPLUG_TRACE
#include <stdio.h>
/*
 * DBG((stderr, fmt, ...)) prints a "file:line: " prefix followed by the message.
 * Wrapped in do/while(0) so that both fprintf calls stay together as a single
 * statement, even under an unbraced if/else.
 */
#define DBG(x) do { fprintf(stderr, "%s:%d: ",__FILE__,__LINE__); fprintf x; } while(0)
#endif
/**
* Internal structure of an ICU plugin.
* One entry of the static pluginList array; the string fields are
* fixed-size buffers of UPLUG_NAME_MAX bytes.
*/
struct UPlugData {
UPlugEntrypoint *entrypoint; /**< plugin entrypoint */
uint32_t structSize; /**< initialized to the size of this structure */
uint32_t token; /**< must be U_PLUG_TOKEN */
void *lib; /**< plugin library, or nullptr */
char libName[UPLUG_NAME_MAX]; /**< library name */
char sym[UPLUG_NAME_MAX]; /**< plugin symbol, or an empty string */
char config[UPLUG_NAME_MAX]; /**< configuration data */
void *context; /**< user context data */
char name[UPLUG_NAME_MAX]; /**< name of plugin */
UPlugLevel level; /**< level of plugin */
UBool awaitingLoad; /**< true if the plugin is awaiting a load call */
UBool dontUnload; /**< true if plugin must stay resident (leak plugin and lib) */
UErrorCode pluginStatus; /**< status code of plugin */
};
/* Capacity of the statically allocated library table (staticLibraryList). */
#define UPLUG_LIBRARY_INITIAL_COUNT 8
/* Capacity of the statically allocated plugin table (pluginList). */
#define UPLUG_PLUGIN_INITIAL_COUNT 12
/**
 * Remove an item from a packed array by shifting every later entry down by
 * one slot.
 * @param list the full list
 * @param listSize the number of entries in the list
 * @param memberSize the size of one member, in bytes
 * @param itemToRemove the zero-based item number of the member
 * @return the new listsize; unchanged if the list is empty or itemToRemove
 *         is negative
 */
static int32_t uplug_removeEntryAt(void *list, int32_t listSize, int32_t memberSize, int32_t itemToRemove) {
    uint8_t *bytePtr = (uint8_t *)list;

    /* get rid of some bad cases first */
    if(listSize<1 || itemToRemove<0) {
        return listSize;
    }

    /* is there anything to move? */
    if(listSize > itemToRemove+1) {
        /* Close the gap by moving the WHOLE tail (every entry after the removed
           one), not just the single neighbouring entry. Offsets are computed in
           size_t so that they cannot overflow int32_t. */
        const size_t stride = (size_t)memberSize;
        const size_t tailCount = (size_t)(listSize - (itemToRemove+1));
        memmove(bytePtr+((size_t)itemToRemove*stride),
                bytePtr+(((size_t)itemToRemove+1)*stride),
                tailCount*stride);
    }

    return listSize-1;
}
#if U_ENABLE_DYLOAD
/**
* Library management. Internal.
* @internal
*/
struct UPlugLibrary;
/**
* Library management. Internal.
* One entry of the library table: an open library handle, the name it was
* opened under, and how many times it has been opened without being closed.
* @internal
*/
typedef struct UPlugLibrary {
void *lib; /**< library ptr */
char name[UPLUG_NAME_MAX]; /**< library name */
uint32_t ref; /**< reference count */
} UPlugLibrary;
/* Fixed-size backing storage for the library table. */
static UPlugLibrary staticLibraryList[UPLUG_LIBRARY_INITIAL_COUNT];
/* The library table in use; initially the static storage above. */
static UPlugLibrary * libraryList = staticLibraryList;
/* Number of entries of libraryList currently in use. */
static int32_t libraryCount = 0;
/* Capacity of libraryList. */
static int32_t libraryMax = UPLUG_LIBRARY_INITIAL_COUNT;
/**
 * Look up a library table entry by name. Doesn't lock.
 * @param libName library name to search for
 * @return the index of the first entry with that name, or -1 if there is none
 */
static int32_t searchForLibraryName(const char *libName) {
    for(int32_t idx=0; idx<libraryCount; ++idx) {
        if(uprv_strcmp(libName, libraryList[idx].name) == 0) {
            return idx;
        }
    }
    return -1;
}
/**
 * Look up a library table entry by library handle.
 * @param lib library handle to search for
 * @return the index of the first entry holding that handle, or -1 if there is none
 */
static int32_t searchForLibrary(void *lib) {
    for(int32_t idx=0; idx<libraryCount; ++idx) {
        if(libraryList[idx].lib==lib) {
            return idx;
        }
    }
    return -1;
}
/*
 * Returns the name stored in the library table for the handle lib.
 * Sets *status to U_MISSING_RESOURCE_ERROR and returns nullptr if the handle
 * is not in the table; returns nullptr untouched if *status is already a failure.
 */
U_CAPI char * U_EXPORT2
uplug_findLibrary(void *lib, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return nullptr;
    }
    const int32_t entryIndex = searchForLibrary(lib);
    if(entryIndex == -1) {
        *status = U_MISSING_RESOURCE_ERROR;
        return nullptr;
    }
    return libraryList[entryIndex].name;
}
/*
 * Opens the library libName, or adds a reference to it if it is already in
 * the library table.
 *
 * A new table slot is claimed only once the library has actually been opened.
 * When the table is full, *status is set to U_MEMORY_ALLOCATION_ERROR and the
 * table is left untouched. The stored name is always NUL-terminated (names of
 * UPLUG_NAME_MAX bytes or more are truncated).
 *
 * @param libName library name to open
 * @param status  in/out error code
 * @return the library handle, or nullptr on failure
 */
U_CAPI void * U_EXPORT2
uplug_openLibrary(const char *libName, UErrorCode *status) {
    int32_t libEntry = -1;
    void *lib = nullptr;

    if(U_FAILURE(*status)) return nullptr;

    libEntry = searchForLibraryName(libName);
    if(libEntry == -1) {
        /* Check capacity BEFORE reserving a slot, so that a full table is
           reported without leaving libraryCount pointing past the array. */
        if(libraryCount >= libraryMax) {
            /* Ran out of library slots. Statically allocated because we can't depend on allocating memory.. */
            *status = U_MEMORY_ALLOCATION_ERROR;
#if UPLUG_TRACE
            DBG((stderr, "uplug_openLibrary() - out of library slots (max %d)\n", libraryMax));
#endif
            return nullptr;
        }
        libEntry = libraryCount;
        /* Some operating systems don't want
           DL operations from multiple threads. */
        libraryList[libEntry].lib = uprv_dl_open(libName, status);
#if UPLUG_TRACE
        DBG((stderr, "uplug_openLibrary(%s,%s) libEntry %d, lib %p\n", libName, u_errorName(*status), libEntry, lib));
#endif

        if(libraryList[libEntry].lib == nullptr || U_FAILURE(*status)) {
            /* cleanup. */
            libraryList[libEntry].lib = nullptr; /* failure with open */
            libraryList[libEntry].name[0] = 0;
#if UPLUG_TRACE
            DBG((stderr, "uplug_openLibrary(%s,%s) libEntry %d, lib %p\n", libName, u_errorName(*status), libEntry, lib));
#endif
            /* no need to free - the slot was never counted. */
        } else {
            /* link it in */
            uprv_strncpy(libraryList[libEntry].name,libName,UPLUG_NAME_MAX-1);
            libraryList[libEntry].name[UPLUG_NAME_MAX-1] = 0; /* strncpy does not terminate on truncation */
            libraryList[libEntry].ref=1;
            lib = libraryList[libEntry].lib;
            libraryCount++;
        }
    } else {
        /* already open: hand out the same handle and count the new user */
        lib = libraryList[libEntry].lib;
        libraryList[libEntry].ref++;
    }
    return lib;
}
/*
 * Drops one reference to the library handle lib. When the reference count
 * reaches zero the library is closed and its entry removed from the table.
 * Sets *status to U_INTERNAL_PROGRAM_ERROR if lib is not in the table.
 */
U_CAPI void U_EXPORT2
uplug_closeLibrary(void *lib, UErrorCode *status) {
#if UPLUG_TRACE
    DBG((stderr, "uplug_closeLibrary(%p,%s) list %p\n", lib, u_errorName(*status), (void*)libraryList));
#endif
    if(U_FAILURE(*status)) return;

    const int32_t entryIndex = searchForLibrary(lib);
    if(entryIndex < 0) {
        *status = U_INTERNAL_PROGRAM_ERROR; /* could not find the entry! */
        return;
    }

    UPlugLibrary *entry = &libraryList[entryIndex];
    entry->ref--;
    if(entry->ref == 0) {
        uprv_dl_close(entry->lib, status);
        libraryCount = uplug_removeEntryAt(libraryList, libraryCount, sizeof(*libraryList), entryIndex);
    }
}
#endif
/* Statically allocated table of plugins; entries [0, pluginCount) are in use. */
static UPlugData pluginList[UPLUG_PLUGIN_INITIAL_COUNT];
/* Number of entries of pluginList currently in use. */
static int32_t pluginCount = 0;
/**
 * Map a pointer into pluginList back to its index.
 * @param d pointer to a plugin entry
 * @return 0 if d is at or before the start of the list, pluginCount if d is
 *         at or past the end of the used entries, otherwise the entry's index
 */
static int32_t uplug_pluginNumber(UPlugData* d) {
    UPlugData *pastPlug = &pluginList[pluginCount];
    if(d<=pluginList) {
        return 0;
    } else if(d>=pastPlug) {
        return pluginCount;
    } else {
        /* Subtracting two UPlugData pointers already yields a count of
           elements, so it must not be divided by the element size again. */
        return (int32_t)(d-pluginList);
    }
}
/*
 * Iterates over the plugin list.
 * @param prior nullptr to get the first plugin, or the value returned by a
 *              previous call
 * @return the plugin following prior, or nullptr if there are no more
 *         plugins (including when the list is empty)
 */
U_CAPI UPlugData * U_EXPORT2
uplug_nextPlug(UPlugData *prior) {
    UPlugData *pastPlug = &pluginList[pluginCount];
    if(prior==nullptr) {
        /* With no plugins registered, pluginList[0] is an unused slot and
           must not be handed out as a plugin. */
        return (pluginCount>0) ? pluginList : nullptr;
    } else {
        UPlugData *nextPlug = &prior[1];
        if(nextPlug>=pastPlug) {
            return nullptr;
        } else {
            return nextPlug;
        }
    }
}
/**
 * Call the plugin's entrypoint with the given reason.
 * Does nothing if plug is nullptr or *status is already a failure.
 * Sets *status to U_INTERNAL_PROGRAM_ERROR if the plugin has no entrypoint
 * or if the entrypoint does not return UPLUG_TOKEN.
 */
static void uplug_callPlug(UPlugData *plug, UPlugReason reason, UErrorCode *status) {
    UPlugTokenReturn token;
    if(plug==nullptr||U_FAILURE(*status)) {
        return;
    }
    /* Error plugs and plugs kept "as a message" after a failed removal have
       no entrypoint; report that instead of calling through a null pointer. */
    if(plug->entrypoint==nullptr) {
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }
    token = (*(plug->entrypoint))(plug, reason, status);
    if(token!=UPLUG_TOKEN) {
        *status = U_INTERNAL_PROGRAM_ERROR;
    }
}
/*
 * Sends UPLUG_REASON_UNLOAD to a plugin. A plugin still awaiting its load
 * call is an internal error; a plugin whose load status is a failure is
 * skipped silently.
 */
static void uplug_unloadPlug(UPlugData *plug, UErrorCode *status) {
    if(plug->awaitingLoad) {  /* shouldn't happen. Plugin hasn't been loaded yet.*/
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }
    /* Don't unload a plug which has a failing load status - means it didn't actually load. */
    if(U_FAILURE(plug->pluginStatus)) {
        return;
    }
    uplug_callPlug(plug, UPLUG_REASON_UNLOAD, status);
}
/*
 * Sends UPLUG_REASON_QUERY to a freshly allocated plugin so that it can set
 * its level. The level is preset to UPLUG_LEVEL_INVALID; if the call succeeds
 * but the level is still invalid afterwards, or if the call fails, the
 * failure is recorded in pluginStatus and the plugin will not be loaded.
 */
static void uplug_queryPlug(UPlugData *plug, UErrorCode *status) {
    const UBool freshPlug = plug->awaitingLoad && (plug->level == UPLUG_LEVEL_UNKNOWN);
    if(!freshPlug) {  /* shouldn't happen. Plugin hasn't been loaded yet.*/
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    plug->level = UPLUG_LEVEL_INVALID;
    uplug_callPlug(plug, UPLUG_REASON_QUERY, status);

    if(U_FAILURE(*status)) {
        plug->pluginStatus = U_INTERNAL_PROGRAM_ERROR;
        plug->awaitingLoad = false;
    } else if(plug->level == UPLUG_LEVEL_INVALID) {
        plug->pluginStatus = U_PLUGIN_DIDNT_SET_LEVEL;
        plug->awaitingLoad = false;
    }
}
/*
 * Sends UPLUG_REASON_LOAD to a plugin that is awaiting load and has a level
 * of at least UPLUG_LEVEL_LOW; anything else is an internal error.
 * The plugin is no longer awaiting load afterwards, and a failed call is
 * recorded in pluginStatus.
 */
static void uplug_loadPlug(UPlugData *plug, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    const UBool loadable = plug->awaitingLoad && (plug->level >= UPLUG_LEVEL_LOW);
    if(!loadable) {  /* shouldn't happen. Plugin hasn't been loaded yet.*/
        *status = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    uplug_callPlug(plug, UPLUG_REASON_LOAD, status);
    plug->awaitingLoad = false;
    if(U_FAILURE(*status)) {
        plug->pluginStatus = U_INTERNAL_PROGRAM_ERROR;
    }
}
/*
 * Claims the next free slot of pluginList and resets every field to its
 * "empty plugin" state (awaiting load, level unknown, no library, no
 * entrypoint, no context, empty strings).
 * Sets *status to U_MEMORY_ALLOCATION_ERROR and returns nullptr when the
 * table is full.
 */
static UPlugData *uplug_allocateEmptyPlug(UErrorCode *status)
{
    UPlugData *plug = nullptr;

    if(U_FAILURE(*status)) {
        return nullptr;
    }

    if(pluginCount >= UPLUG_PLUGIN_INITIAL_COUNT) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    plug = &pluginList[pluginCount++];

    plug->token = UPLUG_TOKEN;
    plug->structSize = sizeof(UPlugData);
    plug->name[0]=0;
    plug->level = UPLUG_LEVEL_UNKNOWN; /* initialize to null state */
    plug->awaitingLoad = true;
    plug->dontUnload = false;
    plug->pluginStatus = U_ZERO_ERROR;
    plug->libName[0] = 0;
    plug->config[0]=0;
    plug->sym[0]=0;
    plug->lib=nullptr;
    plug->entrypoint=nullptr;
    /* Slots are reused after a plugin is removed; do not let a new plugin
       inherit the previous occupant's context pointer. */
    plug->context=nullptr;

    return plug;
}
/*
 * Allocates a plugin entry, fills in its entrypoint, library, symbol name and
 * configuration string, and queries the plugin for its level.
 * config and symName may be nullptr; strings of UPLUG_NAME_MAX bytes or more
 * are truncated and always stored NUL-terminated.
 * Returns nullptr if no entry could be allocated. If the query fails, the
 * entry is still returned and *status carries the failure.
 */
static UPlugData *uplug_allocatePlug(UPlugEntrypoint *entrypoint, const char *config, void *lib, const char *symName,
                                     UErrorCode *status) {
    UPlugData *plug = uplug_allocateEmptyPlug(status);
    if(U_FAILURE(*status)) {
        return nullptr;
    }

    if(config!=nullptr) {
        uprv_strncpy(plug->config, config, UPLUG_NAME_MAX-1);
        plug->config[UPLUG_NAME_MAX-1] = 0; /* strncpy does not terminate on truncation */
    } else {
        plug->config[0] = 0;
    }

    if(symName!=nullptr) {
        uprv_strncpy(plug->sym, symName, UPLUG_NAME_MAX-1);
        plug->sym[UPLUG_NAME_MAX-1] = 0; /* strncpy does not terminate on truncation */
    } else {
        plug->sym[0] = 0;
    }

    plug->entrypoint = entrypoint;
    plug->lib = lib;
    uplug_queryPlug(plug, status);

    return plug;
}
/*
 * Releases a plugin entry: closes its library (unless the plugin is marked
 * dontUnload) and, if everything succeeded, removes the entry from
 * pluginList. If *status is (or becomes) a failure, the entry is kept in the
 * list "as a message", with no entrypoint and marked dontUnload.
 * A nullptr plug is ignored.
 */
static void uplug_deallocatePlug(UPlugData *plug, UErrorCode *status) {
    UErrorCode subStatus = U_ZERO_ERROR;
    /* Callers pass nullptr here when allocating the entry itself failed
       (for example because the plugin table is full). */
    if(plug==nullptr) {
        return;
    }
    if(!plug->dontUnload) {
#if U_ENABLE_DYLOAD
        uplug_closeLibrary(plug->lib, &subStatus);
#endif
    }
    plug->lib = nullptr;
    if(U_SUCCESS(*status) && U_FAILURE(subStatus)) {
        *status = subStatus;
    }
    /* shift plugins up and decrement count. */
    if(U_SUCCESS(*status)) {
        /* all ok- remove. */
        pluginCount = uplug_removeEntryAt(pluginList, pluginCount, sizeof(plug[0]), uplug_pluginNumber(plug));
    } else {
        /* not ok- leave as a message. */
        plug->awaitingLoad=false;
        plug->entrypoint=0;
        plug->dontUnload=true;
    }
}
/* Unloads a plugin and then releases its entry; a nullptr plugin is ignored. */
static void uplug_doUnloadPlug(UPlugData *plugToRemove, UErrorCode *status) {
    if(plugToRemove == nullptr) {
        return;
    }
    uplug_unloadPlug(plugToRemove, status);
    uplug_deallocatePlug(plugToRemove, status);
}
/*
 * Unloads and removes plug, but only if it is found while walking the plugin
 * list from its first slot; an unknown pointer is ignored.
 */
U_CAPI void U_EXPORT2
uplug_removePlug(UPlugData *plug, UErrorCode *status) {
    if(U_FAILURE(*status)) return;

    UPlugData *found = nullptr;
    UPlugData *walker = pluginList;
    while(walker != nullptr && found == nullptr) {
        if(walker == plug) {
            found = walker;
        } else {
            walker = uplug_nextPlug(walker);
        }
    }

    uplug_doUnloadPlug(found, status);
}
/* Sets the plugin's dontUnload flag. */
U_CAPI void U_EXPORT2
uplug_setPlugNoUnload(UPlugData *data, UBool dontUnload)
{
    UPlugData &plug = *data;
    plug.dontUnload = dontUnload;
}
/* Sets the plugin's level. */
U_CAPI void U_EXPORT2
uplug_setPlugLevel(UPlugData *data, UPlugLevel level) {
    UPlugData &plug = *data;
    plug.level = level;
}
/* Returns the plugin's level. */
U_CAPI UPlugLevel U_EXPORT2
uplug_getPlugLevel(UPlugData *data) {
    const UPlugLevel currentLevel = data->level;
    return currentLevel;
}
/*
 * Copies name into the plugin's fixed-size name buffer. Names of
 * UPLUG_NAME_MAX bytes or more are truncated; the stored name is always
 * NUL-terminated.
 */
U_CAPI void U_EXPORT2
uplug_setPlugName(UPlugData *data, const char *name) {
    uprv_strncpy(data->name, name, UPLUG_NAME_MAX-1);
    data->name[UPLUG_NAME_MAX-1] = 0; /* strncpy does not terminate on truncation */
}
/* Returns a pointer to the plugin's name buffer. */
U_CAPI const char * U_EXPORT2
uplug_getPlugName(UPlugData *data) {
    const char *plugName = data->name;
    return plugName;
}
/* Returns a pointer to the plugin's symbol-name buffer. */
U_CAPI const char * U_EXPORT2
uplug_getSymbolName(UPlugData *data) {
    const char *symbolName = data->sym;
    return symbolName;
}
/*
 * Returns the plugin's library name: the name stored in the plugin itself if
 * there is one, otherwise (with dynamic loading enabled) the name recorded in
 * the library table for the plugin's library handle, otherwise nullptr.
 */
U_CAPI const char * U_EXPORT2
uplug_getLibraryName(UPlugData *data, UErrorCode *status) {
    if(data->libName[0] != 0) {
        return data->libName;
    }
#if U_ENABLE_DYLOAD
    return uplug_findLibrary(data->lib, status);
#else
    return nullptr;
#endif
}
/* Returns the plugin's library handle (may be nullptr). */
U_CAPI void * U_EXPORT2
uplug_getLibrary(UPlugData *data) {
    void *library = data->lib;
    return library;
}
/* Returns the plugin's user context pointer. */
U_CAPI void * U_EXPORT2
uplug_getContext(UPlugData *data) {
    void *userContext = data->context;
    return userContext;
}
/* Stores a user context pointer in the plugin. */
U_CAPI void U_EXPORT2
uplug_setContext(UPlugData *data, void *context) {
    UPlugData &plug = *data;
    plug.context = context;
}
/* Returns a pointer to the plugin's configuration-string buffer. */
U_CAPI const char* U_EXPORT2
uplug_getConfiguration(UPlugData *data) {
    const char *configuration = data->config;
    return configuration;
}
/* Returns the n-th entry of the plugin list, or nullptr if n is out of range. */
U_CAPI UPlugData* U_EXPORT2
uplug_getPlugInternal(int32_t n) {
    const bool inRange = (n >= 0) && (n < pluginCount);
    if(inRange) {
        return &(pluginList[n]);
    }
    return nullptr;
}
/* Returns the status code recorded for the plugin. */
U_CAPI UErrorCode U_EXPORT2
uplug_getPlugLoadStatus(UPlugData *plug) {
    const UErrorCode recorded = plug->pluginStatus;
    return recorded;
}
/**
 * Allocate and query a plugin for the given entrypoint and library, without
 * loading it. If allocation or the query fails, the entry is handed to
 * uplug_deallocatePlug() and nullptr is returned.
 */
static UPlugData* uplug_initPlugFromEntrypointAndLibrary(UPlugEntrypoint *entrypoint, const char *config, void *lib, const char *sym,
                                                         UErrorCode *status) {
    UPlugData *plug = uplug_allocatePlug(entrypoint, config, lib, sym, status);
    if(U_FAILURE(*status)) {
        uplug_deallocatePlug(plug, status);
        return nullptr;
    }
    return plug;
}
/*
 * Creates a plugin from an in-process entrypoint (no library, no symbol name)
 * and loads it. Returns the plugin entry, or nullptr if it could not be created.
 */
U_CAPI UPlugData* U_EXPORT2
uplug_loadPlugFromEntrypoint(UPlugEntrypoint *entrypoint, const char *config, UErrorCode *status) {
    void *noLibrary = nullptr;
    const char *noSymbol = nullptr;
    UPlugData *created = uplug_initPlugFromEntrypointAndLibrary(entrypoint, config, noLibrary, noSymbol, status);
    uplug_loadPlug(created, status);
    return created;
}
#if U_ENABLE_DYLOAD
/*
 * Creates a placeholder entry for a plugin that could not be loaded, so that
 * the failure shows up in the plugin list. The entry records loadStatus,
 * never loads and is never unloaded. Any of the strings may be nullptr;
 * strings of UPLUG_NAME_MAX bytes or more are truncated and always stored
 * NUL-terminated.
 * Returns nullptr if no entry could be allocated.
 */
static UPlugData*
uplug_initErrorPlug(const char *libName, const char *sym, const char *config, const char *nameOrError, UErrorCode loadStatus, UErrorCode *status)
{
    UPlugData *plug = uplug_allocateEmptyPlug(status);
    if(U_FAILURE(*status)) return nullptr;

    plug->pluginStatus = loadStatus;
    plug->awaitingLoad = false; /* Won't load. */
    plug->dontUnload = true; /* cannot unload. */

    /* strncpy does not terminate on truncation, so copy one byte less than
       the buffer size and terminate explicitly. */
    if(sym!=nullptr) {
        uprv_strncpy(plug->sym, sym, UPLUG_NAME_MAX-1);
        plug->sym[UPLUG_NAME_MAX-1] = 0;
    }

    if(libName!=nullptr) {
        uprv_strncpy(plug->libName, libName, UPLUG_NAME_MAX-1);
        plug->libName[UPLUG_NAME_MAX-1] = 0;
    }

    if(nameOrError!=nullptr) {
        uprv_strncpy(plug->name, nameOrError, UPLUG_NAME_MAX-1);
        plug->name[UPLUG_NAME_MAX-1] = 0;
    }

    if(config!=nullptr) {
        uprv_strncpy(plug->config, config, UPLUG_NAME_MAX-1);
        plug->config[UPLUG_NAME_MAX-1] = 0;
    }

    return plug;
}
/**
* Fetch a plugin from DLL, and then initialize it from a library- but don't load it.
*
* Opens libName, looks up the entrypoint sym in it and creates a plugin entry
* for that entrypoint. If the library cannot be opened or the entrypoint
* cannot be found, an "error plug" describing the failure is created instead
* and returned.
* @param libName library to open
* @param sym name of the entrypoint symbol
* @param config configuration string passed on to the plugin entry
* @param status in/out error code
* @return the plugin entry (possibly an error plug), or nullptr
*/
static UPlugData*
uplug_initPlugFromLibrary(const char *libName, const char *sym, const char *config, UErrorCode *status) {
void *lib = nullptr;
UPlugData *plug = nullptr;
if(U_FAILURE(*status)) { return nullptr; }
lib = uplug_openLibrary(libName, status);
if(lib!=nullptr && U_SUCCESS(*status)) {
UPlugEntrypoint *entrypoint = nullptr;
entrypoint = (UPlugEntrypoint*)uprv_dlsym_func(lib, sym, status);
if(entrypoint!=nullptr&&U_SUCCESS(*status)) {
plug = uplug_initPlugFromEntrypointAndLibrary(entrypoint, config, lib, sym, status);
if(plug!=nullptr&&U_SUCCESS(*status)) {
plug->lib = lib; /* plug takes ownership of library */
lib = nullptr; /* library is now owned by plugin. */
}
} else {
/* The error plug is created with its own status so that *status keeps the dlsym failure. */
UErrorCode subStatus = U_ZERO_ERROR;
plug = uplug_initErrorPlug(libName,sym,config,"ERROR: Could not load entrypoint",(lib==nullptr)?U_MISSING_RESOURCE_ERROR:*status,&subStatus);
}
/* lib is still non-null here only if no plugin took ownership of it above. */
/* NOTE(review): when uplug_initPlugFromEntrypointAndLibrary() fails it calls uplug_deallocatePlug(), */
/* which may already have closed this library - confirm this second close is intended. */
if(lib!=nullptr) { /* still need to close the lib */
UErrorCode subStatus = U_ZERO_ERROR;
uplug_closeLibrary(lib, &subStatus); /* don't care here */
}
} else {
/* The error plug is created with its own status so that *status keeps the open failure. */
UErrorCode subStatus = U_ZERO_ERROR;
plug = uplug_initErrorPlug(libName,sym,config,"ERROR: could not load library",(lib==nullptr)?U_MISSING_RESOURCE_ERROR:*status,&subStatus);
}
return plug;
}
/*
 * Creates a plugin from the entrypoint sym in library libName and loads it.
 * Returns the plugin entry produced by uplug_initPlugFromLibrary(), or
 * nullptr if *status is already a failure.
 */
U_CAPI UPlugData* U_EXPORT2
uplug_loadPlugFromLibrary(const char *libName, const char *sym, const char *config, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return nullptr;
    }
    UPlugData *created = uplug_initPlugFromLibrary(libName, sym, config, status);
    uplug_loadPlug(created, status);
    return created;
}
#endif
/* Current plugin level: starts at UPLUG_LEVEL_LOW, is raised to UPLUG_LEVEL_HIGH at the end of uplug_init() and reset to LOW by uplug_cleanup(). */
static UPlugLevel gCurrentLevel = UPLUG_LEVEL_LOW;
/* Returns the current plugin level. */
U_CAPI UPlugLevel U_EXPORT2 uplug_getCurrentLevel() {
    const UPlugLevel current = gCurrentLevel;
    return current;
}
/*
* Cleanup callback registered with ucln_registerCleanup() by uplug_init():
* unloads and deallocates the registered plugins and resets the current
* level to UPLUG_LEVEL_LOW. Always returns true; per-plugin errors are
* collected in a local status and dropped.
*/
static UBool U_CALLCONV uplug_cleanup()
{
int32_t i;
UPlugData *pluginToRemove;
/* cleanup plugs */
/* NOTE(review): uplug_doUnloadPlug() can remove the entry and shrink pluginCount while */
/* i keeps advancing, so this loop looks like it can skip entries - confirm. */
for(i=0;i<pluginCount;i++) {
UErrorCode subStatus = U_ZERO_ERROR;
pluginToRemove = &pluginList[i];
/* unload and deallocate */
uplug_doUnloadPlug(pluginToRemove, &subStatus);
}
/* close other held libs? */
gCurrentLevel = UPLUG_LEVEL_LOW;
return true;
}
#if U_ENABLE_DYLOAD
/*
* Loads every plugin that is still awaiting load, in two passes:
* first the UPLUG_LEVEL_LOW plugins, then everything else.
* Per-plugin failures go into a local status and the plugin's pluginStatus;
* they are not written to *status.
*/
static void uplug_loadWaitingPlugs(UErrorCode *status) {
int32_t i;
UPlugLevel currentLevel = uplug_getCurrentLevel();
if(U_FAILURE(*status)) {
return;
}
#if UPLUG_TRACE
DBG((stderr, "uplug_loadWaitingPlugs() Level: %d\n", currentLevel));
#endif
/* pass #1: low level plugs */
for(i=0;i<pluginCount;i++) {
UErrorCode subStatus = U_ZERO_ERROR;
UPlugData *pluginToLoad = &pluginList[i];
if(pluginToLoad->awaitingLoad) {
if(pluginToLoad->level == UPLUG_LEVEL_LOW) {
if(currentLevel > UPLUG_LEVEL_LOW) {
/* the level has already moved past LOW: too late for a low-level plugin */
pluginToLoad->pluginStatus = U_PLUGIN_TOO_HIGH;
} else {
UPlugLevel newLevel;
uplug_loadPlug(pluginToLoad, &subStatus);
/* re-read the level: flag the plugin if it went up during the load */
newLevel = uplug_getCurrentLevel();
if(newLevel > currentLevel) {
pluginToLoad->pluginStatus = U_PLUGIN_CHANGED_LEVEL_WARNING;
currentLevel = newLevel;
}
}
pluginToLoad->awaitingLoad = false;
}
}
}
/* pass #2: every plugin still awaiting load (i.e. not handled in pass #1) */
for(i=0;i<pluginCount;i++) {
UErrorCode subStatus = U_ZERO_ERROR;
UPlugData *pluginToLoad = &pluginList[i];
if(pluginToLoad->awaitingLoad) {
if(pluginToLoad->level == UPLUG_LEVEL_INVALID) {
pluginToLoad->pluginStatus = U_PLUGIN_DIDNT_SET_LEVEL;
} else if(pluginToLoad->level == UPLUG_LEVEL_UNKNOWN) {
pluginToLoad->pluginStatus = U_INTERNAL_PROGRAM_ERROR;
} else {
uplug_loadPlug(pluginToLoad, &subStatus);
}
pluginToLoad->awaitingLoad = false;
}
}
#if UPLUG_TRACE
DBG((stderr, " Done Loading Plugs. Level: %d\n", (int32_t)uplug_getCurrentLevel()));
#endif
}
/* Name of the plugin config file. Filled in by uplug_init() and returned by uplug_getPluginFile(); empty until then. */
static char plugin_file[2048] = "";
#endif
/*
 * Returns the plugin configuration file name recorded by uplug_init(), or
 * nullptr when dynamic loading or file I/O is compiled out.
 */
U_CAPI const char* U_EXPORT2
uplug_getPluginFile() {
    const char *configFile = nullptr;
#if U_ENABLE_DYLOAD && !UCONFIG_NO_FILE_IO
    configFile = plugin_file;
#endif
    return configFile;
}
// uplug_init() is called first thing from u_init().
//
// With dynamic loading and file I/O enabled, this locates the plugin
// directory (environment variable ICU_PLUGINS, falling back to
// DEFAULT_ICU_PLUGINS when that macro is defined), reads the plugin
// configuration file in it and creates a plugin entry for every line, then
// loads all waiting plugins.
//
// Each non-comment line of the configuration file has the form
//     libName <whitespace> symName [<whitespace> config...]
// where '#' at the start of a field makes the line a comment and config
// extends to the end of the line with trailing whitespace removed.
//
// Finally the current level is raised to UPLUG_LEVEL_HIGH and the cleanup
// function is registered.
U_CAPI void U_EXPORT2
uplug_init(UErrorCode *status) {
#if !U_ENABLE_DYLOAD
    (void)status; /* unused */
#elif !UCONFIG_NO_FILE_IO
    CharString plugin_dir;
    const char *env = getenv("ICU_PLUGINS");

    if(U_FAILURE(*status)) return;
    if(env != nullptr) {
        plugin_dir.append(env, -1, *status);
    }
    if(U_FAILURE(*status)) return;

#if defined(DEFAULT_ICU_PLUGINS)
    if(plugin_dir.isEmpty()) {
        plugin_dir.append(DEFAULT_ICU_PLUGINS, -1, *status);
    }
#endif

#if UPLUG_TRACE
    DBG((stderr, "ICU_PLUGINS=%s\n", plugin_dir.data()));
#endif

    if(!plugin_dir.isEmpty()) {
        FILE *f;
        CharString pluginFile;
#ifdef OS390BATCH
        /* There are potentially a lot of ways to implement a plugin directory on OS390/zOS */
        /* Keeping in mind that unauthorized file access is logged, monitored, and enforced */
        /* I've chosen to open a DDNAME if BATCH and leave it alone for (presumably) UNIX */
        /* System Services. Alternative techniques might be allocating a member in */
        /* SYS1.PARMLIB or setting an environment variable "ICU_PLUGIN_PATH" (?). The */
        /* DDNAME can be connected to a file in the HFS if need be. */

        pluginFile.append("//DD:ICUPLUG", -1, *status); /* JAM 20 Oct 2011 */
#else
        pluginFile.append(plugin_dir, *status);
        pluginFile.append(U_FILE_SEP_STRING, -1, *status);
        pluginFile.append("icuplugins", -1, *status);
        pluginFile.append(U_ICU_VERSION_SHORT, -1, *status);
        pluginFile.append(".txt", -1, *status);
#endif

#if UPLUG_TRACE
        DBG((stderr, "status=%s\n", u_errorName(*status)));
#endif

        if(U_FAILURE(*status)) {
            return;
        }
        if((size_t)pluginFile.length() > (sizeof(plugin_file)-1)) {
            *status = U_BUFFER_OVERFLOW_ERROR;
#if UPLUG_TRACE
            DBG((stderr, "status=%s\n", u_errorName(*status)));
#endif
            return;
        }

        /* plugin_file is not used for processing - it is only used
           so that uplug_getPluginFile() works (i.e. icuinfo)
        */
        pluginFile.extract(plugin_file, sizeof(plugin_file), *status);

#if UPLUG_TRACE
        DBG((stderr, "pluginfile= %s len %d/%d\n", plugin_file, (int)strlen(plugin_file), (int)sizeof(plugin_file)));
#endif

#ifdef __MVS__
        if (iscics()) /* 12 Nov 2011 JAM */
        {
            f = nullptr;
        }
        else
#endif
        {
            f = fopen(pluginFile.data(), "r");
        }

        if(f != nullptr) {
            char linebuf[1024];
            char *p, *libName=nullptr, *symName=nullptr, *config=nullptr;
            int32_t line = 0;

            while(fgets(linebuf,1023,f)) {
                line++;

                /* Every line starts from a clean slate. In particular, config
                   must not keep pointing into linebuf from a previous line,
                   or a line without a config field would inherit stale text. */
                libName = nullptr;
                symName = nullptr;
                config = nullptr;

                if(!*linebuf || *linebuf=='#') {
                    continue;
                } else {
                    /* isspace() requires a value representable as unsigned char;
                       a plain (possibly negative) char is undefined behavior. */
                    p = linebuf;
                    while(*p&&isspace((unsigned char)*p))
                        p++;
                    if(!*p || *p=='#') continue;
                    libName = p;
                    while(*p&&!isspace((unsigned char)*p)) {
                        p++;
                    }
                    if(!*p || *p=='#') continue; /* no tab after libname */
                    *p=0; /* end of libname */
                    p++;
                    while(*p&&isspace((unsigned char)*p)) {
                        p++;
                    }
                    if(!*p||*p=='#') continue; /* no symname after libname +tab */
                    symName = p;
                    while(*p&&!isspace((unsigned char)*p)) {
                        p++;
                    }

                    if(*p) { /* has config */
                        *p=0;
                        ++p;
                        while(*p&&isspace((unsigned char)*p)) {
                            p++;
                        }
                        if(*p) {
                            config = p;
                        }
                    }

                    /* chop whitespace at the end of the config */
                    if(config!=nullptr&&*config!=0) {
                        p = config+strlen(config);
                        while(p>config&&isspace((unsigned char)*(--p))) {
                            *p=0;
                        }
                    }

                    /* OK, we're good. */
                    {
                        UErrorCode subStatus = U_ZERO_ERROR;
                        UPlugData *plug = uplug_initPlugFromLibrary(libName, symName, config, &subStatus);
                        /* report only the first per-line failure to the caller */
                        if(U_FAILURE(subStatus) && U_SUCCESS(*status)) {
                            *status = subStatus;
                        }
#if UPLUG_TRACE
                        DBG((stderr, "PLUGIN libName=[%s], sym=[%s], config=[%s]\n", libName, symName, config));
                        DBG((stderr, " -> %p, %s\n", (void*)plug, u_errorName(subStatus)));
#else
                        (void)plug; /* unused */
#endif
                    }
                }
            }
            fclose(f);
        } else {
#if UPLUG_TRACE
            DBG((stderr, "Can't open plugin file %s\n", plugin_file));
#endif
        }
    }
    uplug_loadWaitingPlugs(status);
#endif /* U_ENABLE_DYLOAD */
    gCurrentLevel = UPLUG_LEVEL_HIGH;
    ucln_registerCleanup(UCLN_UPLUG, uplug_cleanup);
}
#endif

View file

@ -0,0 +1,93 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2009-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* FILE NAME : icuplugimp.h
*
* Internal functions for the ICU plugin system
*
* Date Name Description
* 10/29/2009 sl New.
******************************************************************************
*/
#ifndef ICUPLUGIMP_H
#define ICUPLUGIMP_H
#include "unicode/icuplug.h"
#if UCONFIG_ENABLE_PLUGINS
/*========================*/
/** @{ Library Manipulation
*/
/**
* Open a library, adding a reference count if needed.
* @param libName library name to load
* @param status error code
* @return the library pointer, or NULL
* @internal internal use only
*/
U_CAPI void * U_EXPORT2
uplug_openLibrary(const char *libName, UErrorCode *status);
/**
* Close a library, if its reference count is 0
* @param lib the library to close
* @param status error code
* @internal internal use only
*/
U_CAPI void U_EXPORT2
uplug_closeLibrary(void *lib, UErrorCode *status);
/**
* Get a library's name, or NULL if not found.
* @param lib the library pointer whose name is looked up
* @param status error code
* @return the library name, or NULL if not found.
* @internal internal use only
*/
U_CAPI char * U_EXPORT2
uplug_findLibrary(void *lib, UErrorCode *status);
/** @} */
/*========================*/
/** @{ ICU Plugin internal interfaces
*/
/**
* Initialize the plugins
* @param status error result
* @internal - Internal use only.
*/
U_CAPI void U_EXPORT2
uplug_init(UErrorCode *status);
/**
* Get raw plug N
* @internal - Internal use only
*/
U_CAPI UPlugData* U_EXPORT2
uplug_getPlugInternal(int32_t n);
/**
* Get the name of the plugin file.
* @internal - Internal use only.
*/
U_CAPI const char* U_EXPORT2
uplug_getPluginFile(void);
/** @} */
#endif
#endif

View file

@ -0,0 +1,447 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* loadednormalizer2impl.cpp
*
* created on: 2014sep03
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/udata.h"
#include "unicode/localpointer.h"
#include "unicode/normalizer2.h"
#include "unicode/ucptrie.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cstring.h"
#include "mutex.h"
#include "norm2allmodes.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "ucln_cmn.h"
#include "uhash.h"
U_NAMESPACE_BEGIN
/**
 * Normalizer2Impl that is populated by load() from a data item opened with
 * udata_openChoice(). Keeps the opened data item and the trie built from it
 * so that both can be released in the destructor.
 */
class LoadedNormalizer2Impl : public Normalizer2Impl {
public:
    LoadedNormalizer2Impl() {}
    virtual ~LoadedNormalizer2Impl();

    /** Opens the "nrm" item `name` in `packageName` and initializes this object from it. */
    void load(const char *packageName, const char *name, UErrorCode &errorCode);

private:
    /** Acceptance callback handed to udata_openChoice() by load(). */
    static UBool U_CALLCONV
    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);

    UDataMemory *memory = nullptr;   // data item opened by load()
    UCPTrie *ownedTrie = nullptr;    // trie opened by load() from the data item's bytes
};
// Closes the data item and the trie that load() stored in this object.
LoadedNormalizer2Impl::~LoadedNormalizer2Impl() {
udata_close(memory);
ucptrie_close(ownedTrie);
}
// Acceptance callback for udata_openChoice(): returns true only for data whose
// header is at least 20 bytes, matches this platform's endianness and charset
// family, carries the data format "Nrm2" and has format version 4.
// The context, type and name arguments are ignored.
UBool U_CALLCONV
LoadedNormalizer2Impl::isAcceptable(void * /*context*/,
                                    const char * /* type */, const char * /*name*/,
                                    const UDataInfo *pInfo) {
    const bool platformMatches=
        pInfo->size>=20 &&
        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily==U_CHARSET_FAMILY;
    const bool isNrm2Version4=
        platformMatches &&
        pInfo->dataFormat[0]==0x4e &&    /* 'N' */
        pInfo->dataFormat[1]==0x72 &&    /* 'r' */
        pInfo->dataFormat[2]==0x6d &&    /* 'm' */
        pInfo->dataFormat[3]==0x32 &&    /* '2' */
        pInfo->formatVersion[0]==4;
    // The data version is deliberately not copied out of pInfo here.
    return isNrm2Version4;
}
// Opens the "nrm" data item `name` from `packageName`, builds the code point
// trie from it and passes the index array, trie, extra data and smallFCD table
// to init(). On any failure errorCode is set and the object is not initialized.
void
LoadedNormalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
    if(U_FAILURE(errorCode)) { return; }

    // The data starts with an int32_t index array; the first section offset
    // divided by 4 is the number of indexes.
    const uint8_t *bytes=static_cast<const uint8_t *>(udata_getMemory(memory));
    const int32_t *indexes=reinterpret_cast<const int32_t *>(bytes);
    const int32_t trieOffset=indexes[IX_NORM_TRIE_OFFSET];
    if(trieOffset/4<=IX_MIN_LCCC_CP) {
        errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
        return;
    }

    // The trie occupies the bytes between the trie offset and the extra-data offset.
    const int32_t extraDataOffset=indexes[IX_EXTRA_DATA_OFFSET];
    ownedTrie=ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16,
                                     bytes+trieOffset, extraDataOffset-trieOffset, nullptr,
                                     &errorCode);
    if(U_FAILURE(errorCode)) { return; }

    // smallFCD: new in formatVersion 2
    const int32_t smallFCDOffset=indexes[IX_SMALL_FCD_OFFSET];
    const uint16_t *extraData=reinterpret_cast<const uint16_t *>(bytes+extraDataOffset);
    const uint8_t *smallFCD=bytes+smallFCDOffset;

    init(indexes, ownedTrie, extraData, smallFCD);
}
// instance cache ---------------------------------------------------------- ***
// Allocates a LoadedNormalizer2Impl, loads the named data into it and hands it
// to the createInstance(impl, errorCode) overload.
// Returns nullptr if errorCode is already a failure or the allocation fails.
Norm2AllModes *
Norm2AllModes::createInstance(const char *packageName,
                              const char *name,
                              UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return nullptr; }

    LoadedNormalizer2Impl *loaded=new LoadedNormalizer2Impl;
    if(loaded!=nullptr) {
        loaded->load(packageName, name, errorCode);
        return createInstance(loaded, errorCode);
    }
    errorCode=U_MEMORY_ALLOCATION_ERROR;
    return nullptr;
}
// Forward declaration of the cleanup function that initSingletons() and
// Normalizer2::getInstance() register with ucln_common_registerCleanup().
U_CDECL_BEGIN
static UBool U_CALLCONV uprv_loaded_normalizer2_cleanup();
U_CDECL_END
// One singleton pointer plus its UInitOnce guard per built-in data name.
// Each pair is filled in by initSingletons() via umtx_initOnce() and torn down
// by uprv_loaded_normalizer2_cleanup().
// The "nfc" pair only exists when NFC data is not hardcoded.
#if !NORM2_HARDCODE_NFC_DATA
static Norm2AllModes *nfcSingleton;
static icu::UInitOnce nfcInitOnce {};
#endif
static Norm2AllModes *nfkcSingleton;
static icu::UInitOnce nfkcInitOnce {};
static Norm2AllModes *nfkc_cfSingleton;
static icu::UInitOnce nfkc_cfInitOnce {};
static Norm2AllModes *nfkc_scfSingleton;
static icu::UInitOnce nfkc_scfInitOnce {};
// Name-keyed cache of Norm2AllModes instances used by Normalizer2::getInstance();
// created lazily there and accessed while holding a Mutex.
static UHashtable *cache=nullptr;
// UInitOnce singleton initialization function.
// `what` selects which singleton to create ("nfc" when not hardcoded, "nfkc",
// "nfkc_cf" or "nfkc_scf"); any other value hits UPRV_UNREACHABLE_EXIT.
// The cleanup function is registered on every call that reaches the end,
// regardless of whether createInstance() succeeded.
static void U_CALLCONV initSingletons(const char *what, UErrorCode &errorCode) {
#if !NORM2_HARDCODE_NFC_DATA
if (uprv_strcmp(what, "nfc") == 0) {
nfcSingleton = Norm2AllModes::createInstance(nullptr, "nfc", errorCode);
} else
#endif
// When the "nfc" branch above is compiled out, the chain starts here.
if (uprv_strcmp(what, "nfkc") == 0) {
nfkcSingleton = Norm2AllModes::createInstance(nullptr, "nfkc", errorCode);
} else if (uprv_strcmp(what, "nfkc_cf") == 0) {
nfkc_cfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_cf", errorCode);
} else if (uprv_strcmp(what, "nfkc_scf") == 0) {
nfkc_scfSingleton = Norm2AllModes::createInstance(nullptr, "nfkc_scf", errorCode);
} else {
UPRV_UNREACHABLE_EXIT; // Unknown singleton
}
ucln_common_registerCleanup(UCLN_COMMON_LOADED_NORMALIZER2, uprv_loaded_normalizer2_cleanup);
}
U_CDECL_BEGIN
// Value deleter installed on the instance cache: destroys one cached Norm2AllModes.
static void U_CALLCONV deleteNorm2AllModes(void *allModes) {
    Norm2AllModes *cached = static_cast<Norm2AllModes *>(allModes);
    delete cached;
}
// Library cleanup hook: destroys every singleton, resets its UInitOnce guard
// so it can be created again, and closes the name-keyed instance cache.
// Always returns true.
static UBool U_CALLCONV uprv_loaded_normalizer2_cleanup() {
    // Delete one singleton, clear its pointer, then re-arm its init-once guard.
    auto releaseSingleton = [](Norm2AllModes *&singleton, icu::UInitOnce &initOnce) {
        delete singleton;
        singleton = nullptr;
        initOnce.reset();
    };

#if !NORM2_HARDCODE_NFC_DATA
    releaseSingleton(nfcSingleton, nfcInitOnce);
#endif
    releaseSingleton(nfkcSingleton, nfkcInitOnce);
    releaseSingleton(nfkc_cfSingleton, nfkc_cfInitOnce);
    releaseSingleton(nfkc_scfSingleton, nfkc_scfInitOnce);

    uhash_close(cache);
    cache=nullptr;
    return true;
}
U_CDECL_END
#if !NORM2_HARDCODE_NFC_DATA
// Returns the lazily created "nfc" singleton, or nullptr if errorCode is
// already a failure on entry.
const Norm2AllModes *
Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfcInitOnce, &initSingletons, "nfc", errorCode);
        return nfcSingleton;
    }
    return nullptr;
}
#endif
// Returns the lazily created "nfkc" singleton, or nullptr if errorCode is
// already a failure on entry.
const Norm2AllModes *
Norm2AllModes::getNFKCInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfkcInitOnce, &initSingletons, "nfkc", errorCode);
        return nfkcSingleton;
    }
    return nullptr;
}
// Returns the lazily created "nfkc_cf" singleton, or nullptr if errorCode is
// already a failure on entry.
const Norm2AllModes *
Norm2AllModes::getNFKC_CFInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfkc_cfInitOnce, &initSingletons, "nfkc_cf", errorCode);
        return nfkc_cfSingleton;
    }
    return nullptr;
}
// Returns the lazily created "nfkc_scf" singleton, or nullptr if errorCode is
// already a failure on entry.
const Norm2AllModes *
Norm2AllModes::getNFKC_SCFInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfkc_scfInitOnce, &initSingletons, "nfkc_scf", errorCode);
        return nfkc_scfSingleton;
    }
    return nullptr;
}
#if !NORM2_HARDCODE_NFC_DATA
// Returns the `comp` member of the NFC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *
Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfc=Norm2AllModes::getNFCInstance(errorCode);
    if(nfc==nullptr) { return nullptr; }
    return &nfc->comp;
}
// Returns the `decomp` member of the NFC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *
Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfc=Norm2AllModes::getNFCInstance(errorCode);
    if(nfc==nullptr) { return nullptr; }
    return &nfc->decomp;
}
// Returns the `fcd` member of the NFC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfc=Norm2AllModes::getNFCInstance(errorCode);
    if(nfc==nullptr) { return nullptr; }
    return &nfc->fcd;
}
// Returns the `fcc` member of the NFC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfc=Norm2AllModes::getNFCInstance(errorCode);
    if(nfc==nullptr) { return nullptr; }
    return &nfc->fcc;
}
// Returns the `impl` pointer of the NFC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    const Norm2AllModes *nfc=Norm2AllModes::getNFCInstance(errorCode);
    if(nfc==nullptr) { return nullptr; }
    return nfc->impl;
}
#endif
// Returns the `comp` member of the NFKC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *
Normalizer2::getNFKCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfkc=Norm2AllModes::getNFKCInstance(errorCode);
    if(nfkc==nullptr) { return nullptr; }
    return &nfkc->comp;
}
// Returns the `decomp` member of the NFKC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *
Normalizer2::getNFKDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfkc=Norm2AllModes::getNFKCInstance(errorCode);
    if(nfkc==nullptr) { return nullptr; }
    return &nfkc->decomp;
}
// Returns the `comp` member of the NFKC_CF Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *
Normalizer2::getNFKCCasefoldInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfkcCf=Norm2AllModes::getNFKC_CFInstance(errorCode);
    if(nfkcCf==nullptr) { return nullptr; }
    return &nfkcCf->comp;
}
// Returns the `comp` member of the NFKC_SCF Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2 *
Normalizer2::getNFKCSimpleCasefoldInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfkcScf=Norm2AllModes::getNFKC_SCFInstance(errorCode);
    if(nfkcScf==nullptr) { return nullptr; }
    return &nfkcScf->comp;
}
// Returns the normalizer for data `name` (optionally in `packageName`) in the
// requested `mode`.
//
// - A null or empty `name` sets U_ILLEGAL_ARGUMENT_ERROR.
// - With a null packageName, the names "nfc", "nfkc", "nfkc_cf" and "nfkc_scf"
//   resolve to the corresponding singletons.
// - Anything else is looked up in, and if necessary added to, the name-keyed
//   cache. The cache is only touched while holding a Mutex; the instance itself
//   is created outside the lock.
//
// Returns nullptr on failure or for a mode not handled by the final switch.
const Normalizer2 *
Normalizer2::getInstance(const char *packageName,
const char *name,
UNormalization2Mode mode,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return nullptr;
}
if(name==nullptr || *name==0) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return nullptr;
}
const Norm2AllModes *allModes=nullptr;
if(packageName==nullptr) {
if(0==uprv_strcmp(name, "nfc")) {
allModes=Norm2AllModes::getNFCInstance(errorCode);
} else if(0==uprv_strcmp(name, "nfkc")) {
allModes=Norm2AllModes::getNFKCInstance(errorCode);
} else if(0==uprv_strcmp(name, "nfkc_cf")) {
allModes=Norm2AllModes::getNFKC_CFInstance(errorCode);
} else if(0==uprv_strcmp(name, "nfkc_scf")) {
allModes=Norm2AllModes::getNFKC_SCFInstance(errorCode);
}
}
if(allModes==nullptr && U_SUCCESS(errorCode)) {
// First attempt: look for an existing cache entry under the lock.
{
Mutex lock;
if(cache!=nullptr) {
allModes=(Norm2AllModes *)uhash_get(cache, name);
}
}
if(allModes==nullptr) {
// Cache miss: load the data without holding the lock, then re-check the
// cache under the lock before inserting.
ucln_common_registerCleanup(UCLN_COMMON_LOADED_NORMALIZER2, uprv_loaded_normalizer2_cleanup);
LocalPointer<Norm2AllModes> localAllModes(
Norm2AllModes::createInstance(packageName, name, errorCode));
if(U_SUCCESS(errorCode)) {
Mutex lock;
if(cache==nullptr) {
cache=uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &errorCode);
if(U_FAILURE(errorCode)) {
return nullptr;
}
// The cache owns both its keys (heap copies of the names) and its values.
uhash_setKeyDeleter(cache, uprv_free);
uhash_setValueDeleter(cache, deleteNorm2AllModes);
}
void *temp=uhash_get(cache, name);
if(temp==nullptr) {
// The key is a private copy of `name`, including its terminating NUL.
int32_t keyLength= static_cast<int32_t>(uprv_strlen(name)+1);
char *nameCopy=(char *)uprv_malloc(keyLength);
if(nameCopy==nullptr) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
uprv_memcpy(nameCopy, name, keyLength);
// Keep a non-owning alias for the return value, then hand ownership to the cache.
allModes=localAllModes.getAlias();
uhash_put(cache, nameCopy, localAllModes.orphan(), &errorCode);
} else {
// race condition
// An entry for this name appeared since the first lookup; use it. The instance
// created here stays in localAllModes and is not put into the cache.
allModes=(Norm2AllModes *)temp;
}
}
}
}
// Select the member of the Norm2AllModes that matches the requested mode.
if(allModes!=nullptr && U_SUCCESS(errorCode)) {
switch(mode) {
case UNORM2_COMPOSE:
return &allModes->comp;
case UNORM2_DECOMPOSE:
return &allModes->decomp;
case UNORM2_FCD:
return &allModes->fcd;
case UNORM2_COMPOSE_CONTIGUOUS:
return &allModes->fcc;
default:
break; // do nothing
}
}
return nullptr;
}
// Maps a legacy UNormalizationMode to the matching Normalizer2 instance.
// Any mode other than NFD, NFKD, NFC, NFKC or FCD (e.g. UNORM_NONE) yields the
// no-op instance. Returns nullptr if errorCode is already a failure.
const Normalizer2 *
Normalizer2Factory::getInstance(UNormalizationMode mode, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return nullptr; }

    if(mode==UNORM_NFD) {
        return Normalizer2::getNFDInstance(errorCode);
    }
    if(mode==UNORM_NFKD) {
        return Normalizer2::getNFKDInstance(errorCode);
    }
    if(mode==UNORM_NFC) {
        return Normalizer2::getNFCInstance(errorCode);
    }
    if(mode==UNORM_NFKC) {
        return Normalizer2::getNFKCInstance(errorCode);
    }
    if(mode==UNORM_FCD) {
        return getFCDInstance(errorCode);
    }
    // UNORM_NONE and everything else
    return getNoopInstance(errorCode);
}
// Returns the `impl` pointer of the NFKC Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFKCImpl(UErrorCode &errorCode) {
    const Norm2AllModes *nfkc=Norm2AllModes::getNFKCInstance(errorCode);
    if(nfkc==nullptr) { return nullptr; }
    return nfkc->impl;
}
// Returns the `impl` pointer of the NFKC_CF Norm2AllModes, or nullptr if that
// instance is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFKC_CFImpl(UErrorCode &errorCode) {
    const Norm2AllModes *nfkcCf=Norm2AllModes::getNFKC_CFInstance(errorCode);
    if(nfkcCf==nullptr) { return nullptr; }
    return nfkcCf->impl;
}
U_NAMESPACE_END
// C API ------------------------------------------------------------------- ***
U_NAMESPACE_USE
// C wrapper for Normalizer2::getNFKCInstance(); pErrorCode is dereferenced unconditionally.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfkc);
}
// C wrapper for Normalizer2::getNFKDInstance(); pErrorCode is dereferenced unconditionally.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKDInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfkd=Normalizer2::getNFKDInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfkd);
}
// C wrapper for Normalizer2::getNFKCCasefoldInstance(); pErrorCode is dereferenced unconditionally.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfkcCf=Normalizer2::getNFKCCasefoldInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfkcCf);
}
// C wrapper for Normalizer2::getNFKCSimpleCasefoldInstance(); pErrorCode is dereferenced unconditionally.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCSimpleCasefoldInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfkcScf=Normalizer2::getNFKCSimpleCasefoldInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfkcScf);
}
// C wrapper for Normalizer2::getInstance(); pErrorCode is dereferenced unconditionally.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getInstance(const char *packageName,
                   const char *name,
                   UNormalization2Mode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *instance=
        Normalizer2::getInstance(packageName, name, mode, *pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(instance);
}
// Quick-check value of code point `c` for a legacy normalization mode.
// Modes at or below UNORM_NONE, or at or above UNORM_FCD, always yield
// UNORM_YES. If the normalizer cannot be obtained the answer is UNORM_MAYBE.
U_CFUNC UNormalizationCheckResult
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
    if(mode<=UNORM_NONE || UNORM_FCD<=mode) {
        return UNORM_YES;
    }
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *norm2=Normalizer2Factory::getInstance(mode, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    const Normalizer2WithImpl *withImpl=static_cast<const Normalizer2WithImpl *>(norm2);
    return withImpl->getQuickCheck(c);
}
#endif // !UCONFIG_NO_NORMALIZATION

View file

@ -0,0 +1,480 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include <utility>
#include "bytesinkutil.h" // StringByteSink<CharString>
#include "charstr.h"
#include "cstring.h"
#include "ulocimp.h"
#include "unicode/localebuilder.h"
#include "unicode/locid.h"
namespace {
// True for the ASCII decimal digits '0' through '9' only.
inline bool UPRV_ISDIGIT(char c) { return '0' <= c && c <= '9'; }
// True for an ASCII letter (per uprv_isASCIILetter) or an ASCII decimal digit.
inline bool UPRV_ISALPHANUM(char c) {
    if (uprv_isASCIILetter(c)) { return true; }
    return UPRV_ISDIGIT(c);
}
constexpr const char* kAttributeKey = "attribute";
// Validates the subtags of one extension. The singleton `key` is compared
// case-insensitively: 'u', 't' and 'x' each have a dedicated validator, and
// every other key uses the generic extension-subtags check.
bool _isExtensionSubtags(char key, const char* s, int32_t len) {
    const char singleton = uprv_tolower(key);
    if (singleton == 'u') {
        return ultag_isUnicodeExtensionSubtags(s, len);
    }
    if (singleton == 't') {
        return ultag_isTransformedExtensionSubtags(s, len);
    }
    if (singleton == 'x') {
        return ultag_isPrivateuseValueSubtags(s, len);
    }
    return ultag_isExtensionSubtags(s, len);
}
} // namespace
U_NAMESPACE_BEGIN
// Creates an empty builder: no error, empty language/script/region strings,
// and no variant or extensions objects allocated yet.
LocaleBuilder::LocaleBuilder() : UObject(), status_(U_ZERO_ERROR), language_(),
script_(), region_(), variant_(nullptr), extensions_(nullptr)
{
language_[0] = 0;
script_[0] = 0;
region_[0] = 0;
}
// Frees the heap-allocated variant and extensions objects, if any.
LocaleBuilder::~LocaleBuilder()
{
delete variant_;
delete extensions_;
}
// Resets the builder (including any earlier error) and re-populates it from
// `locale`: language, script, region and variant go through the individual
// setters, and a clone of the whole locale is kept as the extensions holder.
LocaleBuilder& LocaleBuilder::setLocale(const Locale& locale)
{
    clear()
        .setLanguage(locale.getLanguage())
        .setScript(locale.getScript())
        .setRegion(locale.getCountry())
        .setVariant(locale.getVariant());

    extensions_ = locale.clone();
    if (extensions_ == nullptr) {
        status_ = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}
// Parses `tag` as a language tag and, if parsing succeeds, loads the result
// with setLocale(). A parse error is left in status_.
LocaleBuilder& LocaleBuilder::setLanguageTag(StringPiece tag)
{
    Locale parsed = Locale::forLanguageTag(tag, status_);
    // setLocale() resets status_, so it must only run when parsing succeeded;
    // otherwise the parse error would be lost.
    if (U_SUCCESS(status_)) {
        setLocale(parsed);
    }
    return *this;
}
namespace {
// Copies `input` into the NUL-terminated buffer `dest` if it passes `test`.
// An empty input clears the field; input that fails `test` sets
// U_ILLEGAL_ARGUMENT_ERROR and leaves `dest` untouched.
// Does nothing if errorCode is already a failure.
// NOTE(review): no length check is done here; this relies on `test` rejecting
// anything longer than the destination buffer - confirm against the validators.
void setField(StringPiece input, char* dest, UErrorCode& errorCode,
              bool (*test)(const char*, int32_t)) {
    if (U_FAILURE(errorCode)) { return; }

    if (input.empty()) {
        dest[0] = '\0';
        return;
    }
    if (!test(input.data(), input.length())) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    uprv_memcpy(dest, input.data(), input.length());
    dest[input.length()] = '\0';
}
} // namespace
// Sets the language subtag; validation with ultag_isLanguageSubtag and error
// reporting into status_ are done by setField(). Empty input clears the field.
LocaleBuilder& LocaleBuilder::setLanguage(StringPiece language)
{
setField(language, language_, status_, &ultag_isLanguageSubtag);
return *this;
}
// Sets the script subtag; validation with ultag_isScriptSubtag and error
// reporting into status_ are done by setField(). Empty input clears the field.
LocaleBuilder& LocaleBuilder::setScript(StringPiece script)
{
setField(script, script_, status_, &ultag_isScriptSubtag);
return *this;
}
// Sets the region subtag; validation with ultag_isRegionSubtag and error
// reporting into status_ are done by setField(). Empty input clears the field.
LocaleBuilder& LocaleBuilder::setRegion(StringPiece region)
{
setField(region, region_, status_, &ultag_isRegionSubtag);
return *this;
}
namespace {
// In-place normalization of `len` characters: every '_' becomes '-', and every
// other character is passed through uprv_tolower().
void transform(char* data, int32_t len) {
    for (int32_t pos = 0; pos < len; ++pos) {
        char &ch = data[pos];
        ch = (ch == '_') ? '-' : uprv_tolower(ch);
    }
}
} // namespace
// Sets the variant subtags.
//
// An empty `variant` removes any existing variant. Otherwise the input is
// copied, normalized with transform() ('_' -> '-', lowercased) and validated
// with ultag_isVariantSubtags(); invalid input sets U_ILLEGAL_ARGUMENT_ERROR
// and leaves the current variant unchanged. Does nothing if the builder is
// already in an error state.
LocaleBuilder& LocaleBuilder::setVariant(StringPiece variant)
{
    if (U_FAILURE(status_)) { return *this; }
    if (variant.empty()) {
        delete variant_;
        variant_ = nullptr;
        return *this;
    }
    CharString* new_variant = new CharString(variant, status_);
    if (U_FAILURE(status_)) {
        // The CharString constructor reported a failure after the object itself
        // was allocated; free it so it does not leak.
        delete new_variant;
        return *this;
    }
    if (new_variant == nullptr) {
        status_ = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    transform(new_variant->data(), new_variant->length());
    if (!ultag_isVariantSubtags(new_variant->data(), new_variant->length())) {
        delete new_variant;
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    // Only replace the stored variant once the new one is known to be valid.
    delete variant_;
    variant_ = new_variant;
    return *this;
}
namespace {
// Checks whether `value` is acceptable for the legacy keyword `key`:
//  - a one-character key must be alphanumeric and its value must validate as
//    that extension's subtags;
//  - the key "attribute" must carry valid Unicode locale attributes;
//  - any other key is converted, together with its value, to the Unicode
//    locale key/type form, and both conversions must succeed and validate.
bool
_isKeywordValue(const char* key, const char* value, int32_t value_len)
{
    const bool isSingleCharKey = (key[1] == '\0');
    if (isSingleCharKey) {
        if (!UPRV_ISALPHANUM(uprv_tolower(key[0]))) { return false; }
        return _isExtensionSubtags(key[0], value, value_len);
    }
    if (uprv_strcmp(key, kAttributeKey) == 0) {
        // unicode attributes
        return ultag_isUnicodeLocaleAttributes(value, value_len);
    }
    // Otherwise this is a Unicode extension keyword: convert from the legacy
    // key/value to the Unicode key/type before validating.
    const char* bcpKey = uloc_toUnicodeLocaleKey(key);
    const char* bcpType = uloc_toUnicodeLocaleType(key, value);
    if (bcpKey == nullptr || bcpType == nullptr) { return false; }
    return ultag_isUnicodeLocaleKey(bcpKey, -1) &&
           ultag_isUnicodeLocaleType(bcpType, -1);
}
void
_copyExtensions(const Locale& from, icu::StringEnumeration *keywords,
Locale& to, bool validate, UErrorCode& errorCode)
{
if (U_FAILURE(errorCode)) { return; }
LocalPointer<icu::StringEnumeration> ownedKeywords;
if (keywords == nullptr) {
ownedKeywords.adoptInstead(from.createKeywords(errorCode));
if (U_FAILURE(errorCode) || ownedKeywords.isNull()) { return; }
keywords = ownedKeywords.getAlias();
}
const char* key;
while ((key = keywords->next(nullptr, errorCode)) != nullptr) {
auto value = from.getKeywordValue<CharString>(key, errorCode);
if (U_FAILURE(errorCode)) { return; }
if (uprv_strcmp(key, kAttributeKey) == 0) {
transform(value.data(), value.length());
}
if (validate &&
!_isKeywordValue(key, value.data(), value.length())) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
to.setKeywordValue(key, value.data(), errorCode);
if (U_FAILURE(errorCode)) { return; }
}
}
// Removes everything belonging to the Unicode ('u') extension from `locale`:
// first the "attribute" keyword is set to an empty value, then every Unicode
// keyword reported by createUnicodeKeywords() is set to a null value.
void
_clearUAttributesAndKeyType(Locale& locale, UErrorCode& errorCode)
{
    if (U_FAILURE(errorCode)) { return; }

    // Clear Unicode attributes
    locale.setKeywordValue(kAttributeKey, "", errorCode);

    // Clear all Unicode keyword values
    LocalPointer<icu::StringEnumeration> unicodeKeys(locale.createUnicodeKeywords(errorCode));
    if (U_FAILURE(errorCode) || unicodeKeys.isNull()) { return; }

    for (const char* key = unicodeKeys->next(nullptr, errorCode);
         key != nullptr;
         key = unicodeKeys->next(nullptr, errorCode)) {
        locale.setUnicodeKeywordValue(key, nullptr, errorCode);
    }
}
// Adds the Unicode extension subtags in `value` to `locale`. The subtags are
// parsed by building the language tag "und-u-<value>", and the keywords of the
// resulting locale are copied over without validation.
void
_setUnicodeExtensions(Locale& locale, const CharString& value, UErrorCode& errorCode)
{
    if (U_FAILURE(errorCode)) { return; }

    CharString tag("und-u-", errorCode);
    tag.append(value, errorCode);

    const Locale parsed = Locale::forLanguageTag(tag.data(), errorCode);
    _copyExtensions(parsed, nullptr, locale, false, errorCode);
}
} // namespace
// Sets the extension identified by the singleton `key` to `value`.
//
// `key` must be ASCII alphanumeric. The value is normalized with transform()
// and, unless empty, must validate as subtags of that extension; otherwise
// U_ILLEGAL_ARGUMENT_ERROR is recorded. For the 'u' extension all existing
// Unicode attributes and keywords are cleared first and then replaced by the
// ones in `value` (an empty value just clears them). All other keys are stored
// as a keyword whose name is the single key character.
LocaleBuilder& LocaleBuilder::setExtension(char key, StringPiece value)
{
    if (U_FAILURE(status_)) { return *this; }
    if (!UPRV_ISALPHANUM(key)) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }

    CharString normalized(value, status_);
    if (U_FAILURE(status_)) { return *this; }
    transform(normalized.data(), normalized.length());

    const bool valueIsValid =
        normalized.isEmpty() ||
        _isExtensionSubtags(key, normalized.data(), normalized.length());
    if (!valueIsValid) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }

    // The extensions holder is created on first use as a clone of the root locale.
    if (extensions_ == nullptr) {
        extensions_ = Locale::getRoot().clone();
        if (extensions_ == nullptr) {
            status_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
    }

    if (uprv_tolower(key) == 'u') {
        _clearUAttributesAndKeyType(*extensions_, status_);
        if (U_FAILURE(status_)) { return *this; }
        if (!value.empty()) {
            _setUnicodeExtensions(*extensions_, normalized, status_);
        }
    } else {
        // for t, x and others extension.
        extensions_->setKeywordValue(StringPiece(&key, 1), normalized.data(),
                                     status_);
    }
    return *this;
}
// Sets one Unicode locale keyword. `key` must be a valid Unicode locale key
// and `type`, unless empty, a valid Unicode locale type; otherwise
// U_ILLEGAL_ARGUMENT_ERROR is recorded.
LocaleBuilder& LocaleBuilder::setUnicodeLocaleKeyword(
      StringPiece key, StringPiece type)
{
    if (U_FAILURE(status_)) { return *this; }

    const bool arguments_ok =
        ultag_isUnicodeLocaleKey(key.data(), key.length()) &&
        (type.empty() ||
         ultag_isUnicodeLocaleType(type.data(), type.length()));
    if (!arguments_ok) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }

    // The extensions holder is created on first use as a clone of the root locale.
    if (extensions_ == nullptr) {
        extensions_ = Locale::getRoot().clone();
        if (extensions_ == nullptr) {
            status_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
    }
    extensions_->setUnicodeKeywordValue(key, type, status_);
    return *this;
}
// Adds one Unicode locale attribute to the "attribute" keyword of the
// extensions holder.
//
// The value is normalized with transform() and must pass
// ultag_isUnicodeLocaleAttribute(), otherwise U_ILLEGAL_ARGUMENT_ERROR is
// recorded. If no extensions holder exists yet, one is created and the value
// becomes the only attribute. Otherwise the value is merged into the existing
// attribute string, with '_' used as the separator in the rebuilt string.
LocaleBuilder& LocaleBuilder::addUnicodeLocaleAttribute(
StringPiece value)
{
CharString value_str(value, status_);
if (U_FAILURE(status_)) { return *this; }
transform(value_str.data(), value_str.length());
if (!ultag_isUnicodeLocaleAttribute(value_str.data(), value_str.length())) {
status_ = U_ILLEGAL_ARGUMENT_ERROR;
return *this;
}
if (extensions_ == nullptr) {
extensions_ = Locale::getRoot().clone();
if (extensions_ == nullptr) {
status_ = U_MEMORY_ALLOCATION_ERROR;
return *this;
}
extensions_->setKeywordValue(kAttributeKey, value_str.data(), status_);
return *this;
}
// A local error code is used so that a failed lookup does not put the builder
// into an error state.
UErrorCode localErrorCode = U_ZERO_ERROR;
auto attributes = extensions_->getKeywordValue<CharString>(kAttributeKey, localErrorCode);
if (U_FAILURE(localErrorCode)) {
CharString new_attributes(value_str.data(), status_);
// No attributes, set the attribute.
extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_);
return *this;
}
transform(attributes.data(),attributes.length());
// Walk the existing attributes as NUL-terminated tokens and insert the new
// value before the first token that compares greater.
// NOTE(review): transform() only maps '_' to '-' and lowercases; nothing here
// inserts NUL separators (removeUnicodeLocaleAttribute() does). The loop
// therefore appears to see the whole stored list as a single token, so the
// duplicate check and sorted insert only work for one existing attribute.
// Confirm whether later canonicalization of the locale makes this harmless.
const char* start = attributes.data();
const char* limit = attributes.data() + attributes.length();
CharString new_attributes;
bool inserted = false;
while (start < limit) {
if (!inserted) {
int cmp = uprv_strcmp(start, value_str.data());
if (cmp == 0) { return *this; } // Found it in attributes: Just return
if (cmp > 0) {
if (!new_attributes.isEmpty()) new_attributes.append('_', status_);
new_attributes.append(value_str.data(), status_);
inserted = true;
}
}
if (!new_attributes.isEmpty()) {
new_attributes.append('_', status_);
}
new_attributes.append(start, status_);
// Advance past this token and its terminating NUL.
start += uprv_strlen(start) + 1;
}
// The new value sorts after every existing token: append it at the end.
if (!inserted) {
if (!new_attributes.isEmpty()) {
new_attributes.append('_', status_);
}
new_attributes.append(value_str.data(), status_);
}
// Not yet in the attributes, set the attribute.
extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_);
return *this;
}
// Removes one Unicode locale attribute from the "attribute" keyword of the
// extensions holder.
//
// The value is normalized with transform() and must pass
// ultag_isUnicodeLocaleAttribute(), otherwise U_ILLEGAL_ARGUMENT_ERROR is
// recorded. Nothing happens if there is no extensions holder, the attribute
// lookup fails, the attribute list is empty, or the value is not in the list.
LocaleBuilder& LocaleBuilder::removeUnicodeLocaleAttribute(
    StringPiece value)
{
    CharString target(value, status_);
    if (U_FAILURE(status_)) { return *this; }
    transform(target.data(), target.length());
    if (!ultag_isUnicodeLocaleAttribute(target.data(), target.length())) {
        status_ = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    if (extensions_ == nullptr) { return *this; }

    // A lookup failure is not an error for the builder: just leave things as they are.
    UErrorCode localErrorCode = U_ZERO_ERROR;
    auto attributes = extensions_->getKeywordValue<CharString>(kAttributeKey, localErrorCode);
    if (U_FAILURE(localErrorCode) || attributes.isEmpty()) { return *this; }

    // Split the list in place: every '_' or '-' becomes a NUL terminator and all
    // other characters are lowercased, so each attribute can be compared with
    // uprv_strcmp().
    char* chars = attributes.data();
    const int32_t total = attributes.length();
    for (int32_t pos = 0; pos < total; ++pos) {
        char &ch = chars[pos];
        ch = (ch == '_' || ch == '-') ? '\0' : uprv_tolower(ch);
    }

    // Rebuild the list from every token except the one being removed.
    const char* const limit = attributes.data() + attributes.length();
    CharString remaining;
    bool found = false;
    for (const char* token = attributes.data(); token < limit;
         token += uprv_strlen(token) + 1) {
        if (uprv_strcmp(token, target.data()) == 0) {
            found = true;
            continue;
        }
        if (!remaining.isEmpty()) {
            remaining.append('_', status_);
        }
        remaining.append(token, status_);
    }

    // Only write the keyword back when the value was actually present.
    if (found) {
        extensions_->setKeywordValue(kAttributeKey, remaining.data(), status_);
    }
    return *this;
}
// Returns the builder to its freshly constructed state: drops extensions and
// variant, empties language/script/region and clears any recorded error.
LocaleBuilder& LocaleBuilder::clear()
{
    clearExtensions();

    delete variant_;
    variant_ = nullptr;

    region_[0] = 0;
    script_[0] = 0;
    language_[0] = 0;

    status_ = U_ZERO_ERROR;
    return *this;
}
// Discards the extensions holder (all keywords/extensions set so far).
// Does not touch status_ or any other field.
LocaleBuilder& LocaleBuilder::clearExtensions()
{
delete extensions_;
extensions_ = nullptr;
return *this;
}
// Returns a Locale that has been marked bogus with setToBogus(); used as the
// return value of build() on every failure path.
Locale makeBogusLocale() {
Locale bogus;
bogus.setToBogus();
return bogus;
}
void LocaleBuilder::copyExtensionsFrom(const Locale& src, UErrorCode& errorCode)
{
if (U_FAILURE(errorCode)) { return; }
LocalPointer<icu::StringEnumeration> keywords(src.createKeywords(errorCode));
if (U_FAILURE(errorCode) || keywords.isNull() || keywords->count(errorCode) == 0) {
// Error, or no extensions to copy.
return;
}
if (extensions_ == nullptr) {
extensions_ = Locale::getRoot().clone();
if (extensions_ == nullptr) {
status_ = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
_copyExtensions(src, keywords.getAlias(), *extensions_, false, errorCode);
}
// Builds a Locale from the fields set so far.
//
// Returns a bogus Locale if errorCode is already a failure, if the builder has
// a recorded error (which is copied into errorCode), or if assembling the
// locale fails. The base name is "language[-script][-region][-variant]";
// keywords from the extensions holder are then copied in with validation.
Locale LocaleBuilder::build(UErrorCode& errorCode)
{
    if (U_FAILURE(errorCode)) {
        return makeBogusLocale();
    }
    if (U_FAILURE(status_)) {
        errorCode = status_;
        return makeBogusLocale();
    }

    CharString tag(language_, errorCode);
    if (script_[0] != 0) {
        tag.append('-', errorCode).append(StringPiece(script_), errorCode);
    }
    if (region_[0] != 0) {
        tag.append('-', errorCode).append(StringPiece(region_), errorCode);
    }
    if (variant_ != nullptr) {
        tag.append('-', errorCode).append(StringPiece(variant_->data()), errorCode);
    }
    if (U_FAILURE(errorCode)) {
        return makeBogusLocale();
    }

    Locale product(tag.data());
    if (extensions_ != nullptr) {
        _copyExtensions(*extensions_, nullptr, product, true, errorCode);
    }
    if (U_FAILURE(errorCode)) {
        return makeBogusLocale();
    }
    return product;
}
// Copies the builder's status into outErrorCode unless outErrorCode already
// holds a failure. Returns true if outErrorCode is a failure afterwards.
UBool LocaleBuilder::copyErrorTo(UErrorCode &outErrorCode) const {
    if (U_SUCCESS(outErrorCode)) {
        outErrorCode = status_;
        return U_FAILURE(outErrorCode);
    }
    // Do not overwrite the older error code
    return true;
}
U_NAMESPACE_END

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,834 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// localematcher.cpp
// created: 2019may08 Markus W. Scherer
#include <optional>
#include "unicode/utypes.h"
#include "unicode/localebuilder.h"
#include "unicode/localematcher.h"
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/uloc.h"
#include "unicode/uobject.h"
#include "cstring.h"
#include "localeprioritylist.h"
#include "loclikelysubtags.h"
#include "locdistance.h"
#include "lsr.h"
#include "uassert.h"
#include "uhash.h"
#include "ustr_imp.h"
#include "uvector.h"
#define UND_LSR LSR("und", "", "", LSR::EXPLICIT_LSR)
/**
* Indicator for the lifetime of desired-locale objects passed into the LocaleMatcher.
* Tells the matcher whether it must copy a desired locale that it needs to
* keep beyond one function call, or may keep just a pointer to it.
*
* @draft ICU 65
*/
enum ULocMatchLifetime {
/**
* Locale objects are temporary.
* The matcher will make a copy of a locale that will be used beyond one function call.
*
* @draft ICU 65
*/
ULOCMATCH_TEMPORARY_LOCALES,
/**
* Locale objects are stored at least as long as the matcher is used.
* The matcher will keep only a pointer to a locale that will be used beyond one function call,
* avoiding a copy.
*
* @draft ICU 65
*/
ULOCMATCH_STORED_LOCALES // TODO: permanent? cached? clone?
};
// Plain-C style alias so the enum can be named without the `enum` keyword;
// hidden from Doxygen.
#ifndef U_IN_DOXYGEN
typedef enum ULocMatchLifetime ULocMatchLifetime;
#endif
U_NAMESPACE_BEGIN
// Move constructor: copies all fields from src. If src owned its desired
// locale, ownership moves to the new object and src is reset (null desired
// locale, index -1, not owning). A non-owning src is left unchanged.
LocaleMatcher::Result::Result(LocaleMatcher::Result &&src) noexcept :
desiredLocale(src.desiredLocale),
supportedLocale(src.supportedLocale),
desiredIndex(src.desiredIndex),
supportedIndex(src.supportedIndex),
desiredIsOwned(src.desiredIsOwned) {
if (desiredIsOwned) {
src.desiredLocale = nullptr;
src.desiredIndex = -1;
src.desiredIsOwned = false;
}
}
// Deletes the desired locale only if this Result owns it; the supported locale
// is never deleted here.
LocaleMatcher::Result::~Result() {
if (desiredIsOwned) {
delete desiredLocale;
}
}
// Move assignment: releases what this object owns, then takes over src's
// fields with the same ownership transfer as the move constructor.
LocaleMatcher::Result &LocaleMatcher::Result::operator=(LocaleMatcher::Result &&src) noexcept {
// Explicit destructor call to free an owned desired locale; every field is
// overwritten immediately afterwards.
this->~Result();
desiredLocale = src.desiredLocale;
supportedLocale = src.supportedLocale;
desiredIndex = src.desiredIndex;
supportedIndex = src.supportedIndex;
desiredIsOwned = src.desiredIsOwned;
// Ownership moved here, so src must no longer refer to the desired locale.
if (desiredIsOwned) {
src.desiredLocale = nullptr;
src.desiredIndex = -1;
src.desiredIsOwned = false;
}
return *this;
}
// Combines the best-matching supported locale with details of the desired
// locale. Returns the root locale on error or when there is no supported
// locale, and the supported locale unchanged when there is no desired locale or
// the two are equal. Otherwise starts from the supported locale and overlays
// the desired locale's region, variants and extensions where present.
Locale LocaleMatcher::Result::makeResolvedLocale(UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode) || supportedLocale == nullptr) {
        return Locale::getRoot();
    }
    const Locale *desired = getDesiredLocale();
    if (desired == nullptr || *supportedLocale == *desired) {
        return *supportedLocale;
    }

    LocaleBuilder builder;
    builder.setLocale(*supportedLocale);

    // Copy the region from the desired locale, if there is one.
    const char *desiredRegion = desired->getCountry();
    if (desiredRegion[0] != 0) {
        builder.setRegion(desiredRegion);
    }

    // Copy the variants from the desired locale, if there are any.
    // Note that this will override any supportedLocale variants.
    // For example, "sco-ulster-fonipa" + "...-fonupa" => "sco-fonupa" (replacing ulster).
    const char *desiredVariants = desired->getVariant();
    if (desiredVariants[0] != 0) {
        builder.setVariant(desiredVariants);
    }

    // Copy the extensions from the desired locale, if there are any.
    // C++ note: The following note, copied from Java, may not be true,
    // as long as C++ copies by legacy ICU keyword, not by extension singleton.
    // Note that this will override any supportedLocale extensions.
    // For example, "th-u-nu-latn-ca-buddhist" + "...-u-nu-native" => "th-u-nu-native"
    // (replacing calendar).
    builder.copyExtensionsFrom(*desired, errorCode);
    return builder.build(errorCode);
}
// Move constructor: takes over all settings of src, including every pointer
// the destructor deletes (supportedLocales_, defaultLocale_,
// maxDistanceDesired_, maxDistanceSupported_). Those pointers are nulled in src
// so that each object is deleted exactly once.
LocaleMatcher::Builder::Builder(LocaleMatcher::Builder &&src) noexcept :
        errorCode_(src.errorCode_),
        supportedLocales_(src.supportedLocales_),
        thresholdDistance_(src.thresholdDistance_),
        demotion_(src.demotion_),
        defaultLocale_(src.defaultLocale_),
        withDefault_(src.withDefault_),
        favor_(src.favor_),
        direction_(src.direction_) {
    // Assigned in the body rather than the initializer list so this does not
    // depend on the members' declaration order. Previously these two were not
    // transferred at all, so a moved builder lost its max-distance settings.
    maxDistanceDesired_ = src.maxDistanceDesired_;
    maxDistanceSupported_ = src.maxDistanceSupported_;

    src.supportedLocales_ = nullptr;
    src.defaultLocale_ = nullptr;
    src.maxDistanceDesired_ = nullptr;
    src.maxDistanceSupported_ = nullptr;
}
// Deletes the four heap objects a Builder can hold: the supported-locales
// vector, the default locale and the two max-distance locales.
LocaleMatcher::Builder::~Builder() {
delete supportedLocales_;
delete defaultLocale_;
delete maxDistanceDesired_;
delete maxDistanceSupported_;
}
// Move assignment: releases everything this builder holds, then takes over
// all settings and owned pointers of src, leaving src without owned objects.
LocaleMatcher::Builder &LocaleMatcher::Builder::operator=(LocaleMatcher::Builder &&src) noexcept {
    // Explicit destructor call to delete the currently held objects. Every
    // pointer it deletes must be overwritten below, otherwise it would dangle
    // and be deleted a second time when this object is destroyed.
    this->~Builder();

    errorCode_ = src.errorCode_;
    supportedLocales_ = src.supportedLocales_;
    thresholdDistance_ = src.thresholdDistance_;
    demotion_ = src.demotion_;
    defaultLocale_ = src.defaultLocale_;
    withDefault_ = src.withDefault_;
    favor_ = src.favor_;
    direction_ = src.direction_;
    // Previously not reassigned, which left them pointing at deleted objects.
    maxDistanceDesired_ = src.maxDistanceDesired_;
    maxDistanceSupported_ = src.maxDistanceSupported_;

    // Ownership moved here; src must not delete these again.
    src.supportedLocales_ = nullptr;
    src.defaultLocale_ = nullptr;
    src.maxDistanceDesired_ = nullptr;
    src.maxDistanceSupported_ = nullptr;
    return *this;
}
// Empties the supported-locales vector, if one has been created; the vector itself is kept.
void LocaleMatcher::Builder::clearSupportedLocales() {
    if (supportedLocales_ == nullptr) {
        return;
    }
    supportedLocales_->removeAllElements();
}
bool LocaleMatcher::Builder::ensureSupportedLocaleVector() {
if (U_FAILURE(errorCode_)) { return false; }
if (supportedLocales_ != nullptr) { return true; }
LocalPointer<UVector> lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_);
if (U_FAILURE(errorCode_)) { return false; }
supportedLocales_ = lpSupportedLocales.orphan();
return true;
}
// Replaces the supported locales with the ones parsed from an
// accept-language style list string. The parsed Locale objects are moved
// out of the priority list into the builder's vector.
// A parse failure is recorded in errorCode_ and leaves the current list untouched.
LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListString(
        StringPiece locales) {
    LocalePriorityList list(locales, errorCode_);
    if (U_FAILURE(errorCode_)) {
        return *this;
    }
    clearSupportedLocales();
    if (!ensureSupportedLocaleVector()) {
        return *this;
    }
    const int32_t total = list.getLengthIncludingRemoved();
    // Stop at the first failed insertion.
    for (int32_t i = 0; i < total && U_SUCCESS(errorCode_); ++i) {
        Locale *adopted = list.orphanLocaleAt(i);
        if (adopted != nullptr) {  // null = slot of a removed entry
            supportedLocales_->adoptElement(adopted, errorCode_);
        }
    }
    return *this;
}
// Replaces the supported locales with clones of the locales produced by the iterator.
// Iteration stops as soon as errorCode_ reports a failure.
LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) {
    if (!ensureSupportedLocaleVector()) {
        return *this;
    }
    clearSupportedLocales();
    while (locales.hasNext() && U_SUCCESS(errorCode_)) {
        LocalPointer<Locale> copy(locales.next().clone(), errorCode_);
        supportedLocales_->adoptElement(copy.orphan(), errorCode_);
    }
    return *this;
}
// Appends a clone of the given locale to the supported locales.
// Does nothing if the builder is in an error state or the vector cannot be created.
LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) {
    if (!ensureSupportedLocaleVector()) {
        return *this;
    }
    LocalPointer<Locale> copy(locale.clone(), errorCode_);
    supportedLocales_->adoptElement(copy.orphan(), errorCode_);
    return *this;
}
// Drops any explicit default locale and turns the default off altogether.
// Ignored when the builder is already in an error state.
LocaleMatcher::Builder &LocaleMatcher::Builder::setNoDefaultLocale() {
    if (U_SUCCESS(errorCode_)) {
        delete defaultLocale_;
        defaultLocale_ = nullptr;
        withDefault_ = false;
    }
    return *this;
}
// Sets the default locale to a clone of *defaultLocale, or to "none given"
// when defaultLocale is nullptr; either way the default is enabled.
// The previous default is only released once the clone has succeeded.
LocaleMatcher::Builder &LocaleMatcher::Builder::setDefaultLocale(const Locale *defaultLocale) {
    if (U_FAILURE(errorCode_)) {
        return *this;
    }
    Locale *replacement = (defaultLocale != nullptr) ? defaultLocale->clone() : nullptr;
    if (defaultLocale != nullptr && replacement == nullptr) {
        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    delete defaultLocale_;
    defaultLocale_ = replacement;
    withDefault_ = true;
    return *this;
}
// Stores the favor-subtag setting; ignored when the builder is in an error state.
LocaleMatcher::Builder &LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag subtag) {
    if (U_SUCCESS(errorCode_)) {
        favor_ = subtag;
    }
    return *this;
}
// Stores the demotion setting; ignored when the builder is in an error state.
LocaleMatcher::Builder &LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion demotion) {
    if (U_SUCCESS(errorCode_)) {
        demotion_ = demotion;
    }
    return *this;
}
// Remembers clones of a (desired, supported) locale pair; the matcher constructor
// later derives its distance threshold from this pair.
// Both clones must succeed before the previously stored pair is replaced.
LocaleMatcher::Builder &LocaleMatcher::Builder::setMaxDistance(const Locale &desired,
                                                               const Locale &supported) {
    if (U_FAILURE(errorCode_)) {
        return *this;
    }
    Locale *newDesired = desired.clone();
    Locale *newSupported = supported.clone();
    if (newDesired != nullptr && newSupported != nullptr) {
        delete maxDistanceDesired_;
        delete maxDistanceSupported_;
        maxDistanceDesired_ = newDesired;
        maxDistanceSupported_ = newSupported;
    } else {
        // At most one of the two was allocated; release it.
        delete newDesired;
        delete newSupported;
        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}
#if 0
// NOTE(review): compiled out. The block still carries an "@Deprecated" annotation,
// which is not C++, so it cannot be enabled as-is; presumably kept as a reference
// for a setter that caps the threshold at 100.
/**
* <i>Internal only!</i>
*
* @param thresholdDistance the thresholdDistance to set, with -1 = default
* @return this Builder object
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
LocaleMatcher::Builder &LocaleMatcher::Builder::internalSetThresholdDistance(int32_t thresholdDistance) {
if (U_FAILURE(errorCode_)) { return *this; }
if (thresholdDistance > 100) {
thresholdDistance = 100;
}
thresholdDistance_ = thresholdDistance;
return *this;
}
#endif
// Propagates the builder's error, if any, into outErrorCode.
// An existing failure in outErrorCode is never overwritten.
// Returns true if outErrorCode holds a failure afterwards.
UBool LocaleMatcher::Builder::copyErrorTo(UErrorCode &outErrorCode) const {
    if (U_SUCCESS(outErrorCode)) {
        if (U_SUCCESS(errorCode_)) {
            return false;
        }
        outErrorCode = errorCode_;
    }
    return true;
}
// Creates a matcher from the current settings.
// A failure recorded by the builder is handed to the caller first,
// unless errorCode already reports a failure of its own.
LocaleMatcher LocaleMatcher::Builder::build(UErrorCode &errorCode) const {
    const bool builderFailed = U_FAILURE(errorCode_);
    if (builderFailed && U_SUCCESS(errorCode)) {
        errorCode = errorCode_;
    }
    return LocaleMatcher(*this, errorCode);
}
namespace {
// Returns the maximized LSR for the locale, or UND_LSR when there is a pending
// error, the locale is bogus, or its name is empty ("und").
LSR getMaximalLsrOrUnd(const LikelySubtags &likelySubtags, const Locale &locale,
                       UErrorCode &errorCode) {
    const bool usable =
        U_SUCCESS(errorCode) && !locale.isBogus() && *locale.getName() != 0;
    if (!usable) {
        return UND_LSR;
    }
    return likelySubtags.makeMaximizedLsrFrom(locale, false, errorCode);
}
// Hash callback for LSR keys: returns the hash code stored in the LSR.
int32_t hashLSR(const UHashTok token) {
    return static_cast<const LSR *>(token.pointer)->hashCode;
}
// Key comparison callback for LSR keys: compares the pointed-to LSRs with operator==.
UBool compareLSRs(const UHashTok t1, const UHashTok t2) {
    const LSR &left = *static_cast<const LSR *>(t1.pointer);
    const LSR &right = *static_cast<const LSR *>(t2.pointer);
    return left == right;
}
} // namespace
// Registers lsr -> i in the hash map and appends (lsr, i) to the parallel arrays,
// unless an equal LSR is already present.
// Returns the (possibly incremented) number of used array slots.
int32_t LocaleMatcher::putIfAbsent(const LSR &lsr, int32_t i, int32_t suppLength,
                                   UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || uhash_containsKey(supportedLsrToIndex, &lsr)) {
        return suppLength;
    }
    uhash_putiAllowZero(supportedLsrToIndex, const_cast<LSR *>(&lsr), i, &errorCode);
    if (U_FAILURE(errorCode)) {
        return suppLength;
    }
    supportedLSRs[suppLength] = &lsr;
    supportedIndexes[suppLength] = i;
    return suppLength + 1;
}
/**
 * Builds a matcher from the builder's settings.
 *
 * Clones the default locale and all supported locales, computes the maximized LSR
 * of each supported locale, and fills the LSR-to-index hash map plus the parallel
 * supportedLSRs/supportedIndexes arrays in preference order
 * (default-equivalent locales, then paradigm locales, then the rest).
 * Finally determines the demotion and the distance threshold.
 *
 * On any failure errorCode is set and the constructor returns early;
 * the partially built object must remain safe to destroy.
 *
 * @param builder   source of the settings; not modified
 * @param errorCode in/out ICU error code
 */
LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) :
        likelySubtags(*LikelySubtags::getSingleton(errorCode)),
        localeDistance(*LocaleDistance::getSingleton(errorCode)),
        thresholdDistance(builder.thresholdDistance_),
        demotionPerDesiredLocale(0),
        favorSubtag(builder.favor_),
        direction(builder.direction_),
        supportedLocales(nullptr), lsrs(nullptr), supportedLocalesLength(0),
        supportedLsrToIndex(nullptr),
        supportedLSRs(nullptr), supportedIndexes(nullptr), supportedLSRsLength(0),
        ownedDefaultLocale(nullptr), defaultLocale(nullptr) {
    if (U_FAILURE(errorCode)) { return; }
    const Locale *def = builder.defaultLocale_;
    LSR builderDefaultLSR;
    const LSR *defLSR = nullptr;
    if (def != nullptr) {
        ownedDefaultLocale = def->clone();
        if (ownedDefaultLocale == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        def = ownedDefaultLocale;
        builderDefaultLSR = getMaximalLsrOrUnd(likelySubtags, *def, errorCode);
        if (U_FAILURE(errorCode)) { return; }
        defLSR = &builderDefaultLSR;
    }
    supportedLocalesLength = builder.supportedLocales_ != nullptr ?
        builder.supportedLocales_->size() : 0;
    if (supportedLocalesLength > 0) {
        // Store the supported locales in input order,
        // so that when different types are used (e.g., language tag strings)
        // we can return those by parallel index.
        supportedLocales = static_cast<const Locale **>(
            uprv_malloc(supportedLocalesLength * sizeof(const Locale *)));
        // Supported LSRs in input order.
        // In C++, we store these permanently to simplify ownership management
        // in the hash tables. Duplicate LSRs (if any) are unused overhead.
        lsrs = new LSR[supportedLocalesLength];
        if (supportedLocales == nullptr || lsrs == nullptr) {
            // Keep the object destructible: the destructor walks
            // supportedLocales[0..supportedLocalesLength), which at this point is
            // either null or not yet zeroed. Release it and reset the length;
            // lsrs (if it was allocated) is still freed by the destructor.
            uprv_free(supportedLocales);
            supportedLocales = nullptr;
            supportedLocalesLength = 0;
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        // If the constructor fails partway, we need null pointers for destructibility.
        uprv_memset(supportedLocales, 0, supportedLocalesLength * sizeof(const Locale *));
        for (int32_t i = 0; i < supportedLocalesLength; ++i) {
            const Locale &locale = *static_cast<Locale *>(builder.supportedLocales_->elementAt(i));
            supportedLocales[i] = locale.clone();
            if (supportedLocales[i] == nullptr) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            const Locale &supportedLocale = *supportedLocales[i];
            LSR &lsr = lsrs[i] = getMaximalLsrOrUnd(likelySubtags, supportedLocale, errorCode);
            lsr.setHashCode();
            if (U_FAILURE(errorCode)) { return; }
        }

        // We need an unordered map from LSR to first supported locale with that LSR,
        // and an ordered list of (LSR, supported index) for
        // the supported locales in the following order:
        // 1. Default locale, if it is supported.
        // 2. Priority locales (aka "paradigm locales") in builder order.
        // 3. Remaining locales in builder order.
        supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong,
                                             supportedLocalesLength, &errorCode);
        if (U_FAILURE(errorCode)) { return; }
        supportedLSRs = static_cast<const LSR **>(
            uprv_malloc(supportedLocalesLength * sizeof(const LSR *)));
        supportedIndexes = static_cast<int32_t *>(
            uprv_malloc(supportedLocalesLength * sizeof(int32_t)));
        if (supportedLSRs == nullptr || supportedIndexes == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        int32_t suppLength = 0;
        // Determine insertion order.
        // Add locales immediately that are equivalent to the default.
        MaybeStackArray<int8_t, 100> order(supportedLocalesLength, errorCode);
        if (U_FAILURE(errorCode)) { return; }
        int32_t numParadigms = 0;
        for (int32_t i = 0; i < supportedLocalesLength; ++i) {
            const Locale &locale = *supportedLocales[i];
            const LSR &lsr = lsrs[i];
            if (defLSR == nullptr && builder.withDefault_) {
                // Implicit default locale = first supported locale, if not turned off.
                U_ASSERT(i == 0);
                def = &locale;
                defLSR = &lsr;
                order[i] = 1;
                suppLength = putIfAbsent(lsr, 0, suppLength, errorCode);
            } else if (defLSR != nullptr && lsr.isEquivalentTo(*defLSR)) {
                order[i] = 1;
                suppLength = putIfAbsent(lsr, i, suppLength, errorCode);
            } else if (localeDistance.isParadigmLSR(lsr)) {
                order[i] = 2;
                ++numParadigms;
            } else {
                order[i] = 3;
            }
            if (U_FAILURE(errorCode)) { return; }
        }
        // Add supported paradigm locales.
        int32_t paradigmLimit = suppLength + numParadigms;
        for (int32_t i = 0; i < supportedLocalesLength && suppLength < paradigmLimit; ++i) {
            if (order[i] == 2) {
                suppLength = putIfAbsent(lsrs[i], i, suppLength, errorCode);
            }
        }
        // Add remaining supported locales.
        for (int32_t i = 0; i < supportedLocalesLength; ++i) {
            if (order[i] == 3) {
                suppLength = putIfAbsent(lsrs[i], i, suppLength, errorCode);
            }
        }
        supportedLSRsLength = suppLength;
        // If supportedLSRsLength < supportedLocalesLength then
        // we waste as many array slots as there are duplicate supported LSRs,
        // but the amount of wasted space is small as long as there are few duplicates.
    }

    defaultLocale = def;

    if (builder.demotion_ == ULOCMATCH_DEMOTION_REGION) {
        demotionPerDesiredLocale = localeDistance.getDefaultDemotionPerDesiredLocale();
    }

    if (thresholdDistance >= 0) {
        // already copied
    } else if (builder.maxDistanceDesired_ != nullptr) {
        // Derive the threshold from the distance between the builder's
        // max-distance locale pair.
        LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, *builder.maxDistanceSupported_, errorCode);
        const LSR *pSuppLSR = &suppLSR;
        int32_t indexAndDistance = localeDistance.getBestIndexAndDistance(
                getMaximalLsrOrUnd(likelySubtags, *builder.maxDistanceDesired_, errorCode),
                &pSuppLSR, 1,
                LocaleDistance::shiftDistance(100), favorSubtag, direction);
        if (U_SUCCESS(errorCode)) {
            // +1 for an exclusive threshold from an inclusive max.
            thresholdDistance = LocaleDistance::getDistanceFloor(indexAndDistance) + 1;
        } else {
            thresholdDistance = 0;
        }
    } else {
        thresholdDistance = localeDistance.getDefaultScriptDistance();
    }
}
// Move constructor: copies the settings and steals every owned pointer from src.
// Afterwards src holds null pointers and zero lengths.
LocaleMatcher::LocaleMatcher(LocaleMatcher &&src) noexcept :
        likelySubtags(src.likelySubtags),
        localeDistance(src.localeDistance),
        thresholdDistance(src.thresholdDistance),
        demotionPerDesiredLocale(src.demotionPerDesiredLocale),
        favorSubtag(src.favorSubtag),
        direction(src.direction),
        supportedLocales(src.supportedLocales),
        lsrs(src.lsrs),
        supportedLocalesLength(src.supportedLocalesLength),
        supportedLsrToIndex(src.supportedLsrToIndex),
        supportedLSRs(src.supportedLSRs),
        supportedIndexes(src.supportedIndexes),
        supportedLSRsLength(src.supportedLSRsLength),
        ownedDefaultLocale(src.ownedDefaultLocale),
        defaultLocale(src.defaultLocale) {
    // Default locale.
    src.defaultLocale = nullptr;
    src.ownedDefaultLocale = nullptr;

    // Preference-ordered lookup structures.
    src.supportedLSRsLength = 0;
    src.supportedIndexes = nullptr;
    src.supportedLSRs = nullptr;
    src.supportedLsrToIndex = nullptr;

    // Input-ordered arrays.
    src.supportedLocalesLength = 0;
    src.lsrs = nullptr;
    src.supportedLocales = nullptr;
}
// Releases all owned storage.
// The element loop is guarded because a constructor that failed to allocate
// the array can leave supportedLocales null while supportedLocalesLength is
// already positive; indexing the null array would crash.
LocaleMatcher::~LocaleMatcher() {
    if (supportedLocales != nullptr) {
        for (int32_t i = 0; i < supportedLocalesLength; ++i) {
            delete supportedLocales[i];
        }
        uprv_free(supportedLocales);
    }
    delete[] lsrs;
    uhash_close(supportedLsrToIndex);
    uprv_free(supportedLSRs);
    uprv_free(supportedIndexes);
    delete ownedDefaultLocale;
}
/**
 * Move assignment: releases the current state, then takes over the state of src.
 *
 * Self-assignment is a no-op; without the guard the destructor call would free
 * the very pointers that are copied back from src afterwards.
 * The likelySubtags/localeDistance references are not reassigned here.
 */
LocaleMatcher &LocaleMatcher::operator=(LocaleMatcher &&src) noexcept {
    if (this == &src) { return *this; }
    this->~LocaleMatcher();

    thresholdDistance = src.thresholdDistance;
    demotionPerDesiredLocale = src.demotionPerDesiredLocale;
    favorSubtag = src.favorSubtag;
    direction = src.direction;
    supportedLocales = src.supportedLocales;
    lsrs = src.lsrs;
    supportedLocalesLength = src.supportedLocalesLength;
    supportedLsrToIndex = src.supportedLsrToIndex;
    supportedLSRs = src.supportedLSRs;
    supportedIndexes = src.supportedIndexes;
    supportedLSRsLength = src.supportedLSRsLength;
    ownedDefaultLocale = src.ownedDefaultLocale;
    defaultLocale = src.defaultLocale;

    // src no longer owns anything.
    src.supportedLocales = nullptr;
    src.lsrs = nullptr;
    src.supportedLocalesLength = 0;
    src.supportedLsrToIndex = nullptr;
    src.supportedLSRs = nullptr;
    src.supportedIndexes = nullptr;
    src.supportedLSRsLength = 0;
    src.ownedDefaultLocale = nullptr;
    src.defaultLocale = nullptr;
    return *this;
}
/**
 * Adapts a Locale::Iterator of desired locales into a sequence of maximized LSRs,
 * and remembers the desired locale (and its index) that was recorded
 * via rememberCurrent().
 *
 * With ULOCMATCH_TEMPORARY_LOCALES the remembered locale is a private copy owned
 * by this object until orphanRemembered() hands it out; with
 * ULOCMATCH_STORED_LOCALES it is just a pointer to the iterator's locale.
 */
class LocaleLsrIterator {
public:
    LocaleLsrIterator(const LikelySubtags &likelySubtags, Locale::Iterator &locales,
                      ULocMatchLifetime lifetime) :
            likelySubtags(likelySubtags), locales(locales), lifetime(lifetime) {}

    // Not copyable: a copy would share the owned `remembered` pointer
    // and both destructors would delete it.
    LocaleLsrIterator(const LocaleLsrIterator &) = delete;
    LocaleLsrIterator &operator=(const LocaleLsrIterator &) = delete;

    ~LocaleLsrIterator() {
        if (lifetime == ULOCMATCH_TEMPORARY_LOCALES) {
            delete remembered;
        }
    }

    bool hasNext() const {
        return locales.hasNext();
    }

    /** Advances to the next desired locale and returns its maximized LSR. */
    LSR next(UErrorCode &errorCode) {
        current = &locales.next();
        return getMaximalLsrOrUnd(likelySubtags, *current, errorCode);
    }

    /** Records the current locale and its index as the best desired one so far. */
    void rememberCurrent(int32_t desiredIndex, UErrorCode &errorCode) {
        if (U_FAILURE(errorCode)) { return; }
        bestDesiredIndex = desiredIndex;
        if (lifetime == ULOCMATCH_STORED_LOCALES) {
            remembered = current;
        } else {
            // ULOCMATCH_TEMPORARY_LOCALES
            delete remembered;
            remembered = new Locale(*current);
            if (remembered == nullptr) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
            }
        }
    }

    /** Returns the remembered locale and forgets it; the destructor will no longer delete it. */
    const Locale *orphanRemembered() {
        const Locale *rem = remembered;
        remembered = nullptr;
        return rem;
    }

    int32_t getBestDesiredIndex() const {
        return bestDesiredIndex;
    }

private:
    const LikelySubtags &likelySubtags;
    Locale::Iterator &locales;
    ULocMatchLifetime lifetime;
    const Locale *current = nullptr, *remembered = nullptr;
    int32_t bestDesiredIndex = -1;
};
// Returns the supported locale that best matches the single desired locale,
// or the default locale when nothing matches or the lookup fails.
// Returns nullptr if errorCode already reports a failure on entry.
const Locale *LocaleMatcher::getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    const std::optional<int32_t> best = getBestSuppIndex(
        getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode),
        nullptr, errorCode);
    if (U_FAILURE(errorCode) || !best.has_value()) {
        return defaultLocale;
    }
    return supportedLocales[*best];
}
// Returns the supported locale that best matches the list of desired locales,
// or the default locale when the list is empty, nothing matches, or the lookup fails.
// Returns nullptr if errorCode already reports a failure on entry.
const Locale *LocaleMatcher::getBestMatch(Locale::Iterator &desiredLocales,
                                          UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    if (!desiredLocales.hasNext()) {
        return defaultLocale;
    }
    LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES);
    const std::optional<int32_t> best =
        getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode);
    if (U_FAILURE(errorCode) || !best.has_value()) {
        return defaultLocale;
    }
    return supportedLocales[*best];
}
// Parses an accept-language style list and returns the best match for it.
// A parse failure is reported through errorCode, in which case
// getBestMatch() returns nullptr.
const Locale *LocaleMatcher::getBestMatchForListString(
        StringPiece desiredLocaleList, UErrorCode &errorCode) const {
    if (U_SUCCESS(errorCode)) {
        LocalePriorityList parsed(desiredLocaleList, errorCode);
        LocalePriorityList::Iterator desired = parsed.iterator();
        return getBestMatch(desired, errorCode);
    }
    return nullptr;
}
// Like getBestMatch(const Locale &, ...) but returns a Result carrying the desired
// locale, the supported locale and their indexes. Without a match (or on error) the
// Result holds no desired locale, the default locale, and -1 for both indexes.
LocaleMatcher::Result LocaleMatcher::getBestMatchResult(
        const Locale &desiredLocale, UErrorCode &errorCode) const {
    if (U_SUCCESS(errorCode)) {
        const std::optional<int32_t> best = getBestSuppIndex(
            getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode),
            nullptr, errorCode);
        if (U_SUCCESS(errorCode) && best.has_value()) {
            // The desired locale belongs to the caller, hence not owned by the Result.
            return Result(&desiredLocale, supportedLocales[*best], 0, *best, false);
        }
    }
    return Result(nullptr, defaultLocale, -1, -1, false);
}
// Like getBestMatch(Locale::Iterator &, ...) but returns a Result. On success the
// Result takes ownership of the copy of the best desired locale that the
// LSR iterator remembered.
LocaleMatcher::Result LocaleMatcher::getBestMatchResult(
        Locale::Iterator &desiredLocales, UErrorCode &errorCode) const {
    if (U_SUCCESS(errorCode) && desiredLocales.hasNext()) {
        LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES);
        const std::optional<int32_t> best =
            getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode);
        if (U_SUCCESS(errorCode) && best.has_value()) {
            return Result(lsrIter.orphanRemembered(), supportedLocales[*best],
                          lsrIter.getBestDesiredIndex(), *best, true);
        }
    }
    return Result(nullptr, defaultLocale, -1, -1, false);
}
/**
* Finds the index (into supportedLocales) of the best match for desiredLSR and,
* when remainingIter is not nullptr, for the further desired LSRs it yields.
*
* @param desiredLSR    maximized LSR of the first desired locale
* @param remainingIter source of further desired LSRs, or nullptr for a single locale;
*                      it is also told which desired locale produced the best match
* @param errorCode     in/out ICU error code
* @return the supported-locale index, or std::nullopt on error or
*         when no match was found within the threshold
*/
std::optional<int32_t> LocaleMatcher::getBestSuppIndex(LSR desiredLSR,
LocaleLsrIterator *remainingIter,
UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) { return std::nullopt; }
int32_t desiredIndex = 0;
int32_t bestSupportedLsrIndex = -1;
// bestShiftedDistance starts at the shifted threshold and is lowered on each match
// and by the per-locale demotion, so later desired locales must match more closely.
for (int32_t bestShiftedDistance = LocaleDistance::shiftDistance(thresholdDistance);;) {
// Quick check for exact maximized LSR.
if (supportedLsrToIndex != nullptr) {
desiredLSR.setHashCode();
UBool found = false;
int32_t suppIndex = uhash_getiAndFound(supportedLsrToIndex, &desiredLSR, &found);
if (found) {
if (remainingIter != nullptr) {
remainingIter->rememberCurrent(desiredIndex, errorCode);
}
// The map value is returned directly as the result index.
return suppIndex;
}
}
int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance(
desiredLSR, supportedLSRs, supportedLSRsLength,
bestShiftedDistance, favorSubtag, direction);
// A non-negative packed value is treated as "found a match".
if (bestIndexAndDistance >= 0) {
bestShiftedDistance = LocaleDistance::getShiftedDistance(bestIndexAndDistance);
if (remainingIter != nullptr) {
remainingIter->rememberCurrent(desiredIndex, errorCode);
if (U_FAILURE(errorCode)) { return std::nullopt; }
}
bestSupportedLsrIndex = LocaleDistance::getIndex(bestIndexAndDistance);
}
// Apply the demotion; stop once no distance budget is left.
if ((bestShiftedDistance -= LocaleDistance::shiftDistance(demotionPerDesiredLocale)) <= 0) {
break;
}
if (remainingIter == nullptr || !remainingIter->hasNext()) {
break;
}
desiredLSR = remainingIter->next(errorCode);
if (U_FAILURE(errorCode)) { return std::nullopt; }
++desiredIndex;
}
if (bestSupportedLsrIndex < 0) {
// no good match
return std::nullopt;
}
// Translate the position in the preference-ordered arrays into the stored supported index.
return supportedIndexes[bestSupportedLsrIndex];
}
// Returns true if the distance lookup between desired and supported, limited by
// this matcher's threshold, reports a match (a non-negative packed result).
UBool LocaleMatcher::isMatch(const Locale &desired, const Locale &supported,
                             UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return false;
    }
    LSR supportedLsr = getMaximalLsrOrUnd(likelySubtags, supported, errorCode);
    if (U_FAILURE(errorCode)) {
        return false;
    }
    // The distance lookup expects an array of LSR pointers; pass an array of one.
    const LSR *supportedLsrPtr = &supportedLsr;
    const int32_t packed = localeDistance.getBestIndexAndDistance(
        getMaximalLsrOrUnd(likelySubtags, desired, errorCode),
        &supportedLsrPtr, 1,
        LocaleDistance::shiftDistance(thresholdDistance), favorSubtag, direction);
    return packed >= 0;
}
double LocaleMatcher::internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) { return 0.; }
// Returns the inverse of the distance: That is, 1-distance(desired, supported).
LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode);
if (U_FAILURE(errorCode)) { return 0.; }
const LSR *pSuppLSR = &suppLSR;
int32_t indexAndDistance = localeDistance.getBestIndexAndDistance(
getMaximalLsrOrUnd(likelySubtags, desired, errorCode),
&pSuppLSR, 1,
LocaleDistance::shiftDistance(thresholdDistance), favorSubtag, direction);
double distance = LocaleDistance::getDistanceDouble(indexAndDistance);
return (100.0 - distance) / 100.0;
}
U_NAMESPACE_END
// uloc_acceptLanguage() --------------------------------------------------- ***
U_NAMESPACE_USE
namespace {
// Converter functor from a locale ID string to a Locale.
class LocaleFromTag {
public:
    LocaleFromTag() : locale(Locale::getRoot()) {}

    // Overwrites the stored locale and returns a reference to it;
    // the reference is only good until the next call.
    const Locale &operator()(const char *tag) {
        locale = Locale(tag);
        return locale;
    }

private:
    // The result lives inside the converter so that callers never receive
    // a reference to a temporary or a value that could go out of scope
    // while they still refer to it.
    Locale locale;
};
// Shared implementation of the uloc_acceptLanguage*() functions.
// Builds a matcher from the enumerated supported locales, matches the desired
// locales against it, and writes the name of the best supported locale to dest.
// Returns the length of that name (0 on failure or when there is no match);
// *acceptResult, if given, tells whether the match was exact, a fallback, or failed.
int32_t acceptLanguage(UEnumeration &supportedLocales, Locale::Iterator &desiredLocales,
                       char *dest, int32_t capacity, UAcceptResult *acceptResult,
                       UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return 0;
    }
    LocaleMatcher::Builder builder;
    for (const char *id = uenum_next(&supportedLocales, nullptr, &errorCode);
            id != nullptr;
            id = uenum_next(&supportedLocales, nullptr, &errorCode)) {
        Locale supported(id);
        if (supported.isBogus()) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        builder.addSupportedLocale(supported);
    }
    LocaleMatcher matcher = builder.build(errorCode);
    LocaleMatcher::Result result = matcher.getBestMatchResult(desiredLocales, errorCode);
    if (U_FAILURE(errorCode)) {
        return 0;
    }

    if (result.getDesiredIndex() < 0) {
        // Nothing matched: report failure and return an empty string.
        if (acceptResult != nullptr) {
            *acceptResult = ULOC_ACCEPT_FAILED;
        }
        return u_terminateChars(dest, capacity, 0, &errorCode);
    }

    if (acceptResult != nullptr) {
        const bool exact = *result.getDesiredLocale() == *result.getSupportedLocale();
        *acceptResult = exact ? ULOC_ACCEPT_VALID : ULOC_ACCEPT_FALLBACK;
    }
    const char *bestName = result.getSupportedLocale()->getName();
    const int32_t bestLength = static_cast<int32_t>(uprv_strlen(bestName));
    if (bestLength <= capacity) {
        uprv_memcpy(dest, bestName, bestLength);
    }
    return u_terminateChars(dest, capacity, bestLength, &errorCode);
}
} // namespace
// C API: picks the best available locale for an array of acceptable locale IDs.
// Argument rules: a null buffer requires capacity 0, a null list requires count 0,
// negative sizes are rejected, and availableLocales must not be null.
U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char *result, int32_t resultAvailable,
                    UAcceptResult *outResult,
                    const char **acceptList, int32_t acceptListCount,
                    UEnumeration *availableLocales,
                    UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    const bool badBuffer =
        (result == nullptr) ? (resultAvailable != 0) : (resultAvailable < 0);
    const bool badList =
        (acceptList == nullptr) ? (acceptListCount != 0) : (acceptListCount < 0);
    if (badBuffer || badList || availableLocales == nullptr) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // Convert the ID strings to Locale objects lazily while iterating.
    LocaleFromTag toLocale;
    Locale::ConvertingIterator<const char **, LocaleFromTag> desired(
        acceptList, acceptList + acceptListCount, toLocale);
    return acceptLanguage(*availableLocales, desired,
                          result, resultAvailable, outResult, *status);
}
// C API: picks the best available locale for an HTTP Accept-Language header value.
// Argument rules: a null buffer requires capacity 0, a negative capacity is rejected,
// and neither the header string nor availableLocales may be null.
U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable,
                            UAcceptResult *outResult,
                            const char *httpAcceptLanguage,
                            UEnumeration *availableLocales,
                            UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    const bool badBuffer =
        (result == nullptr) ? (resultAvailable != 0) : (resultAvailable < 0);
    if (badBuffer || httpAcceptLanguage == nullptr || availableLocales == nullptr) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // A parse failure is reported through *status; acceptLanguage() then returns 0.
    LocalePriorityList parsed(httpAcceptLanguage, *status);
    LocalePriorityList::Iterator desired = parsed.iterator();
    return acceptLanguage(*availableLocales, desired,
                          result, resultAvailable, outResult, *status);
}

View file

@ -0,0 +1,240 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// localeprioritylist.cpp
// created: 2019jul11 Markus W. Scherer
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
#include "charstr.h"
#include "cmemory.h"
#include "localeprioritylist.h"
#include "uarrsort.h"
#include "uassert.h"
#include "uhash.h"
U_NAMESPACE_BEGIN
namespace {
// Hash callback for Locale keys: delegates to Locale::hashCode().
int32_t hashLocale(const UHashTok token) {
    return static_cast<const Locale *>(token.pointer)->hashCode();
}
// Key comparison callback for Locale keys: compares the pointed-to locales with operator==.
UBool compareLocales(const UHashTok t1, const UHashTok t2) {
    const Locale &left = *static_cast<const Locale *>(t1.pointer);
    const Locale &right = *static_cast<const Locale *>(t2.pointer);
    return left == right;
}
constexpr int32_t WEIGHT_ONE = 1000;
// One entry of the priority list.
struct LocaleAndWeight {
    Locale *locale;
    int32_t weight;  // 0..1000 = 0.0..1.0
    int32_t index;  // force stable sort

    // Three-way comparison for sorting: higher weight first,
    // equal weights in ascending index order.
    int32_t compare(const LocaleAndWeight &other) const {
        if (weight != other.weight) {
            return other.weight - weight;  // descending: other-this
        }
        return index - other.index;
    }
};
// Comparator for uprv_sortArray(): forwards to LocaleAndWeight::compare(); the context is unused.
int32_t U_CALLCONV
compareLocaleAndWeight(const void * /*context*/, const void *left, const void *right) {
    const LocaleAndWeight &lhs = *static_cast<const LocaleAndWeight *>(left);
    const LocaleAndWeight &rhs = *static_cast<const LocaleAndWeight *>(right);
    return lhs.compare(rhs);
}
// Returns the first position in [p, limit) that does not hold a space character,
// or limit if there is none.
const char *skipSpaces(const char *p, const char *limit) {
    for (; p < limit; ++p) {
        if (*p != ' ') {
            break;
        }
    }
    return p;
}
// Returns the number of characters from p up to (not including) the next
// accept-language delimiter (space, comma or semicolon) or limit.
// Other validation is left up to the Locale constructor.
int32_t findTagLength(const char *p, const char *limit) {
    const char *end = p;
    while (end < limit && *end != ' ' && *end != ',' && *end != ';') {
        ++end;
    }
    return static_cast<int32_t>(end - p);
}
/**
 * Parses a qvalue ("0", "1", or either followed by '.' and fraction digits)
 * after optional leading spaces and returns it in thousandths.
 * The first three fraction digits are used, the fourth one rounds,
 * and any further digits are skipped.
 * p is advanced past the parsed text.
 * Returns a negative value if there is no qvalue or it is greater than 1.0.
 */
int32_t parseWeight(const char *&p, const char *limit) {
    p = skipSpaces(p, limit);
    if (p == limit) {
        return -1;
    }
    const char lead = *p;
    if (lead != '0' && lead != '1') {
        return -1;
    }
    int32_t millis = (lead - '0') * 1000;
    ++p;
    if (p == limit || *p != '.') {
        return millis;
    }
    // scale: 100, 10, 1 for the three significant digits,
    // then 0 for the rounding digit, then -1 while skipping the rest.
    int32_t scale = 100;
    for (++p; p != limit; ++p) {
        const char ch = *p;
        if (ch < '0' || '9' < ch) {
            break;
        }
        const int32_t digit = ch - '0';
        if (scale > 0) {
            millis += digit * scale;
            scale /= 10;
        } else if (scale == 0) {
            // round up
            if (digit >= 5) {
                ++millis;
            }
            scale = -1;
        }
    }
    if (millis > WEIGHT_ONE) {
        return -1;  // bad if > 1.0
    }
    return millis;
}
} // namespace
/**
* Nothing but a wrapper over a MaybeStackArray of LocaleAndWeight.
*
* This wrapper exists (and is not in an anonymous namespace)
* so that we can forward-declare it in the header file and
* don't have to expose the MaybeStackArray specialization and
* the LocaleAndWeight to code (like the test) that #includes localeprioritylist.h.
* Also, otherwise we would have to do a platform-specific
* template export declaration of some kind for the MaybeStackArray specialization
* to be properly exported from the common DLL.
*/
struct LocaleAndWeightArray : public UMemory {
// Entry storage. The template argument 20 is presumably the initial built-in
// capacity (confirm against MaybeStackArray); LocalePriorityList::add() grows
// the array via resize() when it is full.
MaybeStackArray<LocaleAndWeight, 20> array;
};
/**
* Parses a comma-separated list of locale tags, each optionally followed by
* ";q=<weight>", adds them via add(), and finally sorts the list.
*
* On a syntax error or a bogus locale, errorCode is set to
* U_ILLEGAL_ARGUMENT_ERROR and parsing stops; entries added up to that point
* stay in the list and are released by the destructor.
*
* @param s         the list string
* @param errorCode in/out ICU error code
*/
LocalePriorityList::LocalePriorityList(StringPiece s, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return; }
list = new LocaleAndWeightArray();
if (list == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
const char *p = s.data();
const char *limit = p + s.length();
// Each iteration consumes one list element including its trailing comma.
while ((p = skipSpaces(p, limit)) != limit) {
if (*p == ',') {  // empty range field
++p;
continue;
}
int32_t tagLength = findTagLength(p, limit);
if (tagLength == 0) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Copy the tag into a CharString so that tag.data() can be passed to the Locale constructor.
CharString tag(p, tagLength, errorCode);
if (U_FAILURE(errorCode)) { return; }
Locale locale = Locale(tag.data());
if (locale.isBogus()) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
int32_t weight = WEIGHT_ONE;
if ((p = skipSpaces(p + tagLength, limit)) != limit && *p == ';') {
// Expect: ';' [spaces] 'q' [spaces] '=' [spaces] qvalue.
// Each clause advances p as a side effect, so the order of the clauses matters.
if ((p = skipSpaces(p + 1, limit)) == limit || *p != 'q' ||
(p = skipSpaces(p + 1, limit)) == limit || *p != '=' ||
(++p, (weight = parseWeight(p, limit)) < 0)) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
p = skipSpaces(p, limit);
}
if (p != limit && *p != ',') {  // trailing junk
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
add(locale, weight, errorCode);
if (p == limit) { break; }
++p;
}
sort(errorCode);
}
// Deletes the locales still held by the entries (orphaned or removed entries
// hold nullptr), then the array wrapper, and finally closes the hash map.
LocalePriorityList::~LocalePriorityList() {
    if (list != nullptr) {
        int32_t i = 0;
        while (i < listLength) {
            delete list->array[i++].locale;
        }
        delete list;
    }
    uhash_close(map);
}
// Returns the locale stored at slot i; nullptr for a removed or orphaned entry.
// No bounds or null checks: the list must exist and i must be a valid slot.
const Locale *LocalePriorityList::localeAt(int32_t i) const {
    const LocaleAndWeight &entry = list->array[i];
    return entry.locale;
}
// Hands ownership of the locale at slot i to the caller and leaves nullptr in the slot.
// Returns nullptr if there is no list or the slot is already empty.
Locale *LocalePriorityList::orphanLocaleAt(int32_t i) {
    if (list == nullptr) {
        return nullptr;
    }
    Locale *&slot = list->array[i].locale;
    Locale *released = slot;
    slot = nullptr;
    return released;
}
/**
* Appends a locale with the given weight.
* A weight <= 0 (q=0) adds nothing. If an equal locale is already in the list,
* its old entry is emptied (and counted in numRemoved) and, for a positive weight,
* the same Locale object is re-appended at the end with the new weight.
*
* @param locale    the locale to add; cloned unless an existing equal entry is reused
* @param weight    weight in thousandths
* @param errorCode in/out ICU error code
* @return false on error (errorCode is or was set), otherwise true
*/
bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return false; }
// The map from locale to list index is created lazily on the first real insertion.
if (map == nullptr) {
if (weight <= 0) { return true; }  // do not add q=0
map = uhash_open(hashLocale, compareLocales, uhash_compareLong, &errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
LocalPointer<Locale> clone;
UBool found = false;
int32_t index = uhash_getiAndFound(map, &locale, &found);
if (found) {
// Duplicate: Remove the old item and append it anew.
LocaleAndWeight &lw = list->array[index];
// Take over the old entry's Locale object; it is reused below or
// deleted by the LocalPointer if nothing is appended.
clone.adoptInstead(lw.locale);
lw.locale = nullptr;
lw.weight = 0;
++numRemoved;
}
if (weight <= 0) {  // do not add q=0
if (found) {
// Not strictly necessary but cleaner.
uhash_removei(map, &locale);
}
return true;
}
if (clone.isNull()) {
clone.adoptInstead(locale.clone());
// A clone that is bogus although the original is not is reported as an allocation failure too.
if (clone.isNull() || (clone->isBogus() && !locale.isBogus())) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
}
// Grow the array when it is full.
if (listLength == list->array.getCapacity()) {
int32_t newCapacity = listLength < 50 ? 100 : 4 * listLength;
if (list->array.resize(newCapacity, listLength) == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
}
// The map key is the Locale object itself (not a copy), which the new list entry owns below.
uhash_putiAllowZero(map, clone.getAlias(), listLength, &errorCode);
if (U_FAILURE(errorCode)) { return false; }
LocaleAndWeight &lw = list->array[listLength];
lw.locale = clone.orphan();
lw.weight = weight;
lw.index = listLength++;
// Remember that sorting will be needed.
if (weight < WEIGHT_ONE) { hasWeights = true; }
U_ASSERT(uhash_count(map) == getLength());
return true;
}
// Sorts the entries by descending weight.
// The comparator breaks ties by item index, which keeps the sort stable.
// Nothing to do on error, for fewer than two live entries,
// or when every entry has the full weight.
void LocalePriorityList::sort(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (getLength() <= 1 || !hasWeights) {
        return;
    }
    uprv_sortArray(list->array.getAlias(), listLength, sizeof(LocaleAndWeight),
                   compareLocaleAndWeight, nullptr, false, &errorCode);
}
U_NAMESPACE_END

View file

@ -0,0 +1,115 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// localeprioritylist.h
// created: 2019jul11 Markus W. Scherer
#ifndef __LOCALEPRIORITYLIST_H__
#define __LOCALEPRIORITYLIST_H__
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
struct UHashtable;
U_NAMESPACE_BEGIN
struct LocaleAndWeightArray;
/**
* Parses a list of locales from an accept-language string.
* We are a bit more lenient than the spec:
* We accept extra whitespace in more places, empty range fields,
* and any number of qvalue fraction digits.
*
* https://tools.ietf.org/html/rfc2616#section-14.4
* 14.4 Accept-Language
*
* Accept-Language = "Accept-Language" ":"
* 1#( language-range [ ";" "q" "=" qvalue ] )
* language-range = ( ( 1*8ALPHA *( "-" 1*8ALPHA ) ) | "*" )
*
* Each language-range MAY be given an associated quality value which
* represents an estimate of the user's preference for the languages
* specified by that range. The quality value defaults to "q=1". For
* example,
*
* Accept-Language: da, en-gb;q=0.8, en;q=0.7
*
* https://tools.ietf.org/html/rfc2616#section-3.9
* 3.9 Quality Values
*
* HTTP content negotiation (section 12) uses short "floating point"
* numbers to indicate the relative importance ("weight") of various
* negotiable parameters. A weight is normalized to a real number in
* the range 0 through 1, where 0 is the minimum and 1 the maximum
* value. If a parameter has a quality value of 0, then content with
* this parameter is `not acceptable' for the client. HTTP/1.1
* applications MUST NOT generate more than three digits after the
* decimal point. User configuration of these values SHOULD also be
* limited in this fashion.
*
* qvalue = ( "0" [ "." 0*3DIGIT ] )
* | ( "1" [ "." 0*3("0") ] )
*/
class U_COMMON_API LocalePriorityList : public UMemory {
public:
// Iterates over the locales that are still present in the list, skipping empty slots.
// Holds a reference to the list.
class Iterator : public Locale::Iterator {
public:
UBool hasNext() const override { return count < length; }
// Must only be called while hasNext() is true: the loop below has no bounds
// check of its own and relies on a non-null entry still being ahead.
const Locale &next() override {
for(;;) {
const Locale *locale = list.localeAt(index++);
if (locale != nullptr) {
++count;
return *locale;
}
}
}
private:
friend class LocalePriorityList;
// length is captured once, from getLength() at construction time.
Iterator(const LocalePriorityList &list) : list(list), length(list.getLength()) {}
const LocalePriorityList &list;
int32_t index = 0;  // next slot to inspect, including empty ones
int32_t count = 0;  // number of locales returned so far
const int32_t length;
};
// Parses the list string s; errors are reported via errorCode.
LocalePriorityList(StringPiece s, UErrorCode &errorCode);
~LocalePriorityList();
// Number of entries not counting removed ones.
int32_t getLength() const { return listLength - numRemoved; }
// Number of slots, including those of removed entries (use with localeAt()/orphanLocaleAt()).
int32_t getLengthIncludingRemoved() const { return listLength; }
Iterator iterator() const { return Iterator(*this); }
// Returns the locale in slot i, or nullptr for an empty slot.
const Locale *localeAt(int32_t i) const;
// Transfers ownership of the locale in slot i to the caller and empties the slot.
Locale *orphanLocaleAt(int32_t i);
private:
LocalePriorityList(const LocalePriorityList &) = delete;
LocalePriorityList &operator=(const LocalePriorityList &) = delete;
bool add(const Locale &locale, int32_t weight, UErrorCode &errorCode);
void sort(UErrorCode &errorCode);
LocaleAndWeightArray *list = nullptr;
int32_t listLength = 0;
int32_t numRemoved = 0;
bool hasWeights = false;  // other than 1.0
UHashtable *map = nullptr;
};
U_NAMESPACE_END
#endif // __LOCALEPRIORITYLIST_H__

View file

@ -0,0 +1,27 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
***************************************************************************
* Copyright (C) 2006 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
#ifndef LOCALSVC_H
#define LOCALSVC_H
#include "unicode/utypes.h"
#if defined(U_LOCAL_SERVICE_HOOK) && U_LOCAL_SERVICE_HOOK
/**
* Prototype for user-supplied service hook. This function is expected to return
* a type of factory object specific to the requested service.
*
* This declaration is only visible when U_LOCAL_SERVICE_HOOK is defined
* and non-zero.
*
* @param what service-specific string identifying the specific user hook
* @param status error status
* @return a service-specific hook, or NULL on failure.
*/
U_CAPI void* uprv_svc_hook(const char *what, UErrorCode *status);
#endif
#endif

View file

@ -0,0 +1,265 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1997-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: locavailable.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010feb25
* created by: Markus W. Scherer
*
* Code for available locales, separated out from other .cpp files
* that then do not depend on resource bundle code and res_index bundles.
*/
#include "unicode/errorcode.h"
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "cmemory.h"
#include "cstring.h"
#include "ucln_cmn.h"
#include "uassert.h"
#include "umutex.h"
#include "uresimp.h"
// C++ API ----------------------------------------------------------------- ***
U_NAMESPACE_BEGIN
// Array of Locale objects for the available locales; allocated by
// locale_available_init() and released by locale_available_cleanup().
static icu::Locale* availableLocaleList = nullptr;
// Number of entries in availableLocaleList.
static int32_t availableLocaleListCount;
// Guards the one-time call of locale_available_init() made from
// Locale::getAvailableLocales(); reset by locale_available_cleanup().
static icu::UInitOnce gInitOnceLocale {};
namespace {
/**
 * Cleanup callback registered by locale_available_init().
 * Releases the Locale array, zeroes the count and re-arms the init-once
 * guard so that the list can be rebuilt later.
 *
 * @return always true
 */
UBool U_CALLCONV locale_available_cleanup()
{
    if (availableLocaleList != nullptr) {
        delete[] availableLocaleList;
        availableLocaleList = nullptr;
    }
    availableLocaleListCount = 0;

    gInitOnceLocale.reset();
    return true;
}
} // namespace
/**
 * Builds the array returned by Locale::getAvailableLocales().
 *
 * This function is a friend of class Locale and is only invoked via
 * umtx_initOnce(). It creates one Locale per name reported by
 * uloc_getAvailable() and registers locale_available_cleanup().
 * If the array cannot be allocated the list is left empty.
 */
void U_CALLCONV locale_available_init() {
    availableLocaleListCount = uloc_countAvailable();
    if (availableLocaleListCount != 0) {
        availableLocaleList = new Locale[availableLocaleListCount];
    }
    if (availableLocaleList == nullptr) {
        // Nothing to list, or the allocation failed: report an empty list.
        availableLocaleListCount = 0;
    }

    // Fill the array from the last slot down to the first.
    int32_t slot = availableLocaleListCount;
    while (--slot >= 0) {
        availableLocaleList[slot].setFromPOSIXID(uloc_getAvailable(slot));
    }

    ucln_common_registerCleanup(UCLN_COMMON_LOCALE_AVAILABLE, locale_available_cleanup);
}
/**
 * Returns the array of available locales, building it on first use.
 *
 * @param count receives the number of entries in the returned array
 * @return the array (owned by the library); nullptr when the list is empty
 */
const Locale* U_EXPORT2
Locale::getAvailableLocales(int32_t& count)
{
    // One-time, lazily performed construction of the list.
    umtx_initOnce(gInitOnceLocale, &locale_available_init);

    const Locale* locales = availableLocaleList;
    count = availableLocaleListCount;
    return locales;
}
U_NAMESPACE_END
// C API ------------------------------------------------------------------- ***
U_NAMESPACE_USE
/* ### Constants **************************************************/
namespace {
// Enough capacity for the two lists in the res_index.res file
// Indexed by ULocAvailableType (ULOC_AVAILABLE_DEFAULT and
// ULOC_AVAILABLE_ONLY_LEGACY_ALIASES); each slot is an array of locale name
// pointers allocated with uprv_malloc() by AvailableLocalesSink and freed by
// uloc_cleanup().
const char** gAvailableLocaleNames[2] = {};
// Number of names stored in the corresponding gAvailableLocaleNames slot.
int32_t gAvailableLocaleCounts[2] = {};
// Guards the one-time call of loadInstalledLocales(); reset by uloc_cleanup().
icu::UInitOnce ginstalledLocalesInitOnce {};
class AvailableLocalesSink : public ResourceSink {
public:
void put(const char *key, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) override {
if (U_FAILURE(status)) { return; }
ResourceTable resIndexTable = value.getTable(status);
if (U_FAILURE(status)) { return; }
for (int32_t i = 0; resIndexTable.getKeyAndValue(i, key, value); ++i) {
ULocAvailableType type;
if (uprv_strcmp(key, "InstalledLocales") == 0) {
type = ULOC_AVAILABLE_DEFAULT;
} else if (uprv_strcmp(key, "AliasLocales") == 0) {
type = ULOC_AVAILABLE_ONLY_LEGACY_ALIASES;
} else {
// CLDRVersion, etc.
continue;
}
ResourceTable availableLocalesTable = value.getTable(status);
if (U_FAILURE(status)) {
return;
}
gAvailableLocaleCounts[type] = availableLocalesTable.getSize();
gAvailableLocaleNames[type] = static_cast<const char**>(
uprv_malloc(gAvailableLocaleCounts[type] * sizeof(const char*)));
if (gAvailableLocaleNames[type] == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (int32_t j = 0; availableLocalesTable.getKeyAndValue(j, key, value); ++j) {
gAvailableLocaleNames[type][j] = key;
}
}
}
};
class AvailableLocalesStringEnumeration : public StringEnumeration {
public:
AvailableLocalesStringEnumeration(ULocAvailableType type) : fType(type) {
}
const char* next(int32_t *resultLength, UErrorCode &status) override {
if (U_FAILURE(status)) { return nullptr; }
ULocAvailableType actualType = fType;
int32_t actualIndex = fIndex++;
// If the "combined" list was requested, resolve that now
if (fType == ULOC_AVAILABLE_WITH_LEGACY_ALIASES) {
int32_t defaultLocalesCount = gAvailableLocaleCounts[ULOC_AVAILABLE_DEFAULT];
if (actualIndex < defaultLocalesCount) {
actualType = ULOC_AVAILABLE_DEFAULT;
} else {
actualIndex -= defaultLocalesCount;
actualType = ULOC_AVAILABLE_ONLY_LEGACY_ALIASES;
}
}
// Return the requested string
int32_t count = gAvailableLocaleCounts[actualType];
const char* result;
if (actualIndex < count) {
result = gAvailableLocaleNames[actualType][actualIndex];
if (resultLength != nullptr) {
*resultLength = static_cast<int32_t>(uprv_strlen(result));
}
} else {
result = nullptr;
if (resultLength != nullptr) {
*resultLength = 0;
}
}
return result;
}
void reset(UErrorCode &status) override {
if (U_FAILURE(status)) { return; }
fIndex = 0;
}
int32_t count(UErrorCode &status) const override {
if (U_FAILURE(status)) { return 0; }
if (fType == ULOC_AVAILABLE_WITH_LEGACY_ALIASES) {
return gAvailableLocaleCounts[ULOC_AVAILABLE_DEFAULT]
+ gAvailableLocaleCounts[ULOC_AVAILABLE_ONLY_LEGACY_ALIASES];
} else {
return gAvailableLocaleCounts[fType];
}
}
private:
ULocAvailableType fType;
int32_t fIndex = 0;
};
/* ### Get available **************************************************/
/**
 * Cleanup callback registered by loadInstalledLocales().
 * Frees both locale name arrays, zeroes their counts and re-arms the
 * init-once guard.
 *
 * @return always true
 */
UBool U_CALLCONV uloc_cleanup() {
    const int32_t numLists = UPRV_LENGTHOF(gAvailableLocaleNames);
    for (int32_t list = 0; list < numLists; ++list) {
        uprv_free(gAvailableLocaleNames[list]);
        gAvailableLocaleNames[list] = nullptr;
        gAvailableLocaleCounts[list] = 0;
    }

    ginstalledLocalesInitOnce.reset();
    return true;
}
/**
 * Loads the installed-locale lists from the "res_index" bundle.
 * Invoked through umtx_initOnce() by _load_installedLocales().
 *
 * @param status input-output error code
 */
void U_CALLCONV loadInstalledLocales(UErrorCode& status) {
    // Register the cleanup before anything is allocated.
    ucln_common_registerCleanup(UCLN_COMMON_ULOC, uloc_cleanup);

    icu::LocalUResourceBundlePointer resIndex(
        ures_openDirect(nullptr, "res_index", &status));
    AvailableLocalesSink sink;
    // The sink fills gAvailableLocaleNames / gAvailableLocaleCounts.
    ures_getAllItemsWithFallback(resIndex.getAlias(), "", sink, status);
}
/**
 * Ensures the installed-locale lists are loaded, running
 * loadInstalledLocales() through the init-once guard.
 *
 * @param status input-output error code
 */
void _load_installedLocales(UErrorCode& status) {
    umtx_initOnce(ginstalledLocalesInitOnce,
                  &loadInstalledLocales,
                  status);
}
} // namespace
/**
 * Returns the name of the installed locale at the given index.
 *
 * @param offset zero-based index into the installed-locale list; valid values
 *               are 0 .. uloc_countAvailable()-1
 * @return the locale name, or nullptr if the list could not be loaded or
 *         offset is out of range
 */
U_CAPI const char* U_EXPORT2
uloc_getAvailable(int32_t offset) {
    icu::ErrorCode status;
    _load_installedLocales(status);
    if (status.isFailure()) {
        return nullptr;
    }
    // Reject every index outside [0, count). The previous check
    // (offset > count) let offset == count read one element past the end of
    // the array and did not reject negative offsets; with an empty list it
    // could also dereference a null array.
    if (offset < 0 || offset >= gAvailableLocaleCounts[0]) {
        // *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    return gAvailableLocaleNames[0][offset];
}
/**
 * Returns the number of installed locales.
 *
 * @return the size of the installed-locale list, or 0 if it could not be loaded
 */
U_CAPI int32_t U_EXPORT2
uloc_countAvailable() {
    icu::ErrorCode status;
    _load_installedLocales(status);
    return status.isFailure() ? 0 : gAvailableLocaleCounts[0];
}
/**
 * Opens an enumeration over the available locale names of the given type.
 *
 * @param type   which list to enumerate; must be >= 0 and < ULOC_AVAILABLE_COUNT,
 *               otherwise U_ILLEGAL_ARGUMENT_ERROR is set
 * @param status input-output error code
 * @return a new UEnumeration, or nullptr on failure
 */
U_CAPI UEnumeration* U_EXPORT2
uloc_openAvailableByType(ULocAvailableType type, UErrorCode* status) {
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    const bool typeInRange = (type >= 0) && (type < ULOC_AVAILABLE_COUNT);
    if (!typeInRange) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    _load_installedLocales(*status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    // LocalPointer sets *status if the allocation failed.
    LocalPointer<AvailableLocalesStringEnumeration> enumeration(
        new AvailableLocalesStringEnumeration(type), *status);
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    // Ownership of the C++ enumeration passes to the C wrapper.
    return uenum_openFromStringEnumeration(enumeration.orphan(), status);
}

View file

@ -0,0 +1,55 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2004-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: January 16 2004
* Since: ICU 2.8
**********************************************************************
*/
#include "locbased.h"
#include "cstring.h"
U_NAMESPACE_BEGIN
/**
 * Returns the valid or actual locale as a Locale object.
 * When getLocaleID() yields nullptr, the Locale is built from the empty string.
 */
Locale LocaleBased::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    const char* localeID = getLocaleID(type, status);
    if (localeID == nullptr) {
        return Locale("");
    }
    return Locale(localeID);
}
/**
 * Returns the aliased valid or actual locale ID.
 * Sets U_ILLEGAL_ARGUMENT_ERROR and returns nullptr for any other type;
 * returns nullptr without doing anything if status already indicates failure.
 */
const char* LocaleBased::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    if (type == ULOC_VALID_LOCALE) {
        return valid;
    }
    if (type == ULOC_ACTUAL_LOCALE) {
        return actual;
    }

    status = U_ILLEGAL_ARGUMENT_ERROR;
    return nullptr;
}
/**
 * Copies the given IDs into the aliased valid/actual buffers.
 * A null ID leaves the corresponding buffer untouched. At most
 * ULOC_FULLNAME_CAPACITY chars are written and the result is always
 * NUL-terminated.
 */
void LocaleBased::setLocaleIDs(const char* validID, const char* actualID) {
    // Bounded, always-terminated copy; does nothing for a null source.
    const auto copyID = [](char* dest, const char* src) {
        if (src == nullptr) {
            return;
        }
        uprv_strncpy(dest, src, ULOC_FULLNAME_CAPACITY);
        dest[ULOC_FULLNAME_CAPACITY-1] = 0;
    };

    copyID(valid, validID);
    copyID(actual, actualID);
}
/**
 * Copies the names of the given locales into the aliased valid/actual
 * buffers.
 *
 * The copy is delegated to the const char* overload so that it is bounded by
 * ULOC_FULLNAME_CAPACITY and always NUL-terminated. A plain strcpy here could
 * write past the end of the buffers, since a Locale name is not limited to
 * ULOC_FULLNAME_CAPACITY chars.
 */
void LocaleBased::setLocaleIDs(const Locale& validID, const Locale& actualID) {
    setLocaleIDs(validID.getName(), actualID.getName());
}
U_NAMESPACE_END

View file

@ -0,0 +1,107 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2004-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: January 16 2004
* Since: ICU 2.8
**********************************************************************
*/
#ifndef LOCBASED_H
#define LOCBASED_H
#include "unicode/locid.h"
#include "unicode/uobject.h"
/**
* Macro to declare a LocaleBased wrapper object named `varname' for the
* given object `objname', which must have two members named `validLocale'
* and `actualLocale' of size ULOC_FULLNAME_CAPACITY. The wrapper aliases
* those two members; it does not copy them.
*/
#define U_LOCALE_BASED(varname, objname) \
LocaleBased varname((objname).validLocale, (objname).actualLocale)
U_NAMESPACE_BEGIN
/**
* A utility class that unifies the implementation of getLocale() by
* various ICU services. This class is likely to be removed in the
* ICU 3.0 time frame in favor of an integrated approach with the
* services framework.
* @since ICU 2.8
*/
class U_COMMON_API LocaleBased : public UMemory {
public:
/**
* Construct a LocaleBased wrapper around the two pointers. These
* will be aliased for the lifetime of this object; the wrapper does not
* take ownership of them.
*/
inline LocaleBased(char* validAlias, char* actualAlias);
/**
* Construct a LocaleBased wrapper around the two const pointers.
* These will be aliased for the lifetime of this object. The constness
* is cast away internally, so the setLocaleIDs() functions should not be
* called on a wrapper constructed this way.
*/
inline LocaleBased(const char* validAlias, const char* actualAlias);
/**
* Return locale meta-data for the service object wrapped by this
* object. Either the valid or the actual locale may be
* retrieved.
* @param type either ULOC_VALID_LOCALE or ULOC_ACTUAL_LOCALE
* @param status input-output error code
* @return the indicated locale
*/
Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
/**
* Return the locale ID for the service object wrapped by this
* object. Either the valid or the actual locale may be
* retrieved.
* @param type either ULOC_VALID_LOCALE or ULOC_ACTUAL_LOCALE
* @param status input-output error code
* @return the indicated locale ID (a pointer to the aliased buffer)
*/
const char* getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
/**
* Set the locale meta-data for the service object wrapped by this
* object. If either parameter is null, it is ignored.
* @param valid the ID of the valid locale
* @param actual the ID of the actual locale
*/
void setLocaleIDs(const char* valid, const char* actual);
/**
* Set the locale meta-data for the service object wrapped by this
* object.
* @param valid the ID of the valid locale
* @param actual the ID of the actual locale
*/
void setLocaleIDs(const Locale& valid, const Locale& actual);
private:
// Aliased (not owned) buffer holding the valid locale ID.
char* valid;
// Aliased (not owned) buffer holding the actual locale ID.
char* actual;
};
// Aliases the two caller-provided buffers; nothing is copied.
inline LocaleBased::LocaleBased(char* validAlias, char* actualAlias)
    : valid(validAlias),
      actual(actualAlias)
{}
// Aliases the two caller-provided const buffers. The members are non-const
// pointers, so constness has to be cast away here.
inline LocaleBased::LocaleBased(const char* validAlias,
                                const char* actualAlias)
    : valid(const_cast<char*>(validAlias)),
      actual(const_cast<char*>(actualAlias))
{}
U_NAMESPACE_END
#endif

View file

@ -0,0 +1,897 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: locdispnames.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010feb25
* created by: Markus W. Scherer
*
* Code for locale display names, separated out from other .cpp files
* that then do not depend on resource bundle code and display name data.
*/
#include "unicode/utypes.h"
#include "unicode/brkiter.h"
#include "unicode/locid.h"
#include "unicode/uenum.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "putilimp.h"
#include "ulocimp.h"
#include "uresimp.h"
#include "ureslocs.h"
#include "ustr_imp.h"
// C++ API ----------------------------------------------------------------- ***
U_NAMESPACE_BEGIN
// Display name of this locale's language, localized for the default locale.
UnicodeString&
Locale::getDisplayLanguage(UnicodeString& dispLang) const
{
    const Locale& inLocale = getDefault();
    return getDisplayLanguage(inLocale, dispLang);
}
/*We cannot make any assumptions on the size of the output display strings.
* Yet, since we are calling through to a C API, we need to set limits on
* buffer size. For all the following getDisplay functions we first attempt
* to fill up a stack allocated buffer. If it is too small, we heap-allocate
* the exact buffer we need, copy it to the UnicodeString, and delete it.*/
/**
 * Writes the display name of this locale's language, localized for
 * displayLocale, into result. On failure result is left empty.
 */
UnicodeString&
Locale::getDisplayLanguage(const Locale &displayLocale,
                           UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *chars = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t nameLength = uloc_getDisplayLanguage(fullName, displayLocale.fullName,
                                                 chars, result.getCapacity(),
                                                 &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Overflow: nameLength is the length reported by the first call; retry with that size.
    chars = result.getBuffer(nameLength);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    nameLength = uloc_getDisplayLanguage(fullName, displayLocale.fullName,
                                         chars, result.getCapacity(),
                                         &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    return result;
}
// Display name of this locale's script, localized for the default locale.
UnicodeString&
Locale::getDisplayScript(UnicodeString& dispScript) const
{
    const Locale& inLocale = getDefault();
    return getDisplayScript(inLocale, dispScript);
}
/**
 * Writes the display name of this locale's script, localized for
 * displayLocale, into result. On failure result is left empty.
 */
UnicodeString&
Locale::getDisplayScript(const Locale &displayLocale,
                         UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *chars = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t nameLength = uloc_getDisplayScript(fullName, displayLocale.fullName,
                                               chars, result.getCapacity(),
                                               &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Overflow: retry with the length reported by the first call.
    chars = result.getBuffer(nameLength);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    nameLength = uloc_getDisplayScript(fullName, displayLocale.fullName,
                                       chars, result.getCapacity(),
                                       &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    return result;
}
// Display name of this locale's country, localized for the default locale.
UnicodeString&
Locale::getDisplayCountry(UnicodeString& dispCntry) const
{
    const Locale& inLocale = getDefault();
    return getDisplayCountry(inLocale, dispCntry);
}
/**
 * Writes the display name of this locale's country, localized for
 * displayLocale, into result. On failure result is left empty.
 */
UnicodeString&
Locale::getDisplayCountry(const Locale &displayLocale,
                          UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *chars = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t nameLength = uloc_getDisplayCountry(fullName, displayLocale.fullName,
                                                chars, result.getCapacity(),
                                                &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Overflow: retry with the length reported by the first call.
    chars = result.getBuffer(nameLength);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    nameLength = uloc_getDisplayCountry(fullName, displayLocale.fullName,
                                        chars, result.getCapacity(),
                                        &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    return result;
}
// Display name of this locale's variant, localized for the default locale.
UnicodeString&
Locale::getDisplayVariant(UnicodeString& dispVar) const
{
    const Locale& inLocale = getDefault();
    return getDisplayVariant(inLocale, dispVar);
}
/**
 * Writes the display name of this locale's variant, localized for
 * displayLocale, into result. On failure result is left empty.
 */
UnicodeString&
Locale::getDisplayVariant(const Locale &displayLocale,
                          UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *chars = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t nameLength = uloc_getDisplayVariant(fullName, displayLocale.fullName,
                                                chars, result.getCapacity(),
                                                &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Overflow: retry with the length reported by the first call.
    chars = result.getBuffer(nameLength);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    nameLength = uloc_getDisplayVariant(fullName, displayLocale.fullName,
                                        chars, result.getCapacity(),
                                        &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    return result;
}
// Full display name of this locale, localized for the default locale.
UnicodeString&
Locale::getDisplayName(UnicodeString& name) const
{
    const Locale& inLocale = getDefault();
    return getDisplayName(inLocale, name);
}
/**
 * Writes the full display name of this locale, localized for
 * displayLocale, into result. On failure result is left empty.
 */
UnicodeString&
Locale::getDisplayName(const Locale &displayLocale,
                       UnicodeString &result) const {
    // First attempt: a buffer of ULOC_FULLNAME_CAPACITY code units.
    char16_t *chars = result.getBuffer(ULOC_FULLNAME_CAPACITY);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }

    UErrorCode status = U_ZERO_ERROR;
    int32_t nameLength = uloc_getDisplayName(fullName, displayLocale.fullName,
                                             chars, result.getCapacity(),
                                             &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return result;
    }

    // Overflow: retry with the length reported by the first call.
    chars = result.getBuffer(nameLength);
    if (chars == nullptr) {
        result.truncate(0);
        return result;
    }
    status = U_ZERO_ERROR;
    nameLength = uloc_getDisplayName(fullName, displayLocale.fullName,
                                     chars, result.getCapacity(),
                                     &status);
    result.releaseBuffer(U_SUCCESS(status) ? nameLength : 0);
    return result;
}
#if ! UCONFIG_NO_BREAK_ITERATION
// -------------------------------------
// Gets the objectLocale display name in the default locale language.
// Forwards to Locale::getDisplayName(UnicodeString&) on objectLocale.
UnicodeString& U_EXPORT2
BreakIterator::getDisplayName(const Locale& objectLocale,
                              UnicodeString& name)
{
    UnicodeString& displayName = objectLocale.getDisplayName(name);
    return displayName;
}
// -------------------------------------
// Gets the objectLocale display name in the displayLocale language.
// Forwards to Locale::getDisplayName(const Locale&, UnicodeString&) on objectLocale.
UnicodeString& U_EXPORT2
BreakIterator::getDisplayName(const Locale& objectLocale,
                              const Locale& displayLocale,
                              UnicodeString& name)
{
    UnicodeString& displayName = objectLocale.getDisplayName(displayLocale, name);
    return displayName;
}
#endif
U_NAMESPACE_END
// C API ------------------------------------------------------------------- ***
U_NAMESPACE_USE
namespace {
/* ### Constants **************************************************/
/* These strings describe the resources we attempt to load from
the locale ResourceBundle data file.*/
// Table of language display names; _getStringOrCopyKey() applies its
// language-code handling when the table key starts with this string.
constexpr char _kLanguages[] = "Languages";
// Script display names: uloc_getDisplayScript() tries the stand-alone table
// first and falls back to the plain "Scripts" table.
constexpr char _kScripts[] = "Scripts";
constexpr char _kScriptsStandAlone[] = "Scripts%stand-alone";
// Region display names; _getDisplayNameForComponent() compares the tag pointer
// against this array to select the region data bundle.
constexpr char _kCountries[] = "Countries";
constexpr char _kVariants[] = "Variants";
constexpr char _kKeys[] = "Keys";
constexpr char _kTypes[] = "Types";
//constexpr char _kRootName[] = "root";
constexpr char _kCurrency[] = "currency";
constexpr char _kCurrencies[] = "Currencies";
// Keys read by uloc_getDisplayName() to obtain the pattern and separator
// used to combine the display-name components.
constexpr char _kLocaleDisplayPattern[] = "localeDisplayPattern";
constexpr char _kPattern[] = "pattern";
constexpr char _kSeparator[] = "separator";
/* ### Display name **************************************************/
/**
 * Looks up a display string in the resource bundle for `locale` and copies it
 * to dest; if the lookup fails, copies `substitute` instead and sets
 * U_USING_DEFAULT_WARNING.
 *
 * @param path        resource bundle path passed to the lookup functions
 * @param locale      locale whose bundle is searched
 * @param tableKey    key of the table (or of the top-level string when itemKey is null)
 * @param subTableKey optional sub-table key, passed through to the fallback lookup
 * @param itemKey     key of the item inside the table, or nullptr for a top-level string
 * @param substitute  invariant-character string used when no resource string is found
 * @param dest        output buffer
 * @param destCapacity capacity of dest in char16_t units
 * @param errorCode   input-output error code
 * @return the full length of the string (resource string or substitute), as
 *         returned by u_terminateUChars(); may exceed destCapacity
 */
int32_t
_getStringOrCopyKey(const char *path, const char *locale,
const char *tableKey,
const char* subTableKey,
const char *itemKey,
const char *substitute,
char16_t *dest, int32_t destCapacity,
UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return 0; }
const char16_t *s = nullptr;
int32_t length = 0;
if(itemKey==nullptr) {
/* top-level item: normal resource bundle access */
icu::LocalUResourceBundlePointer rb(ures_open(path, locale, &errorCode));
if(U_SUCCESS(errorCode)) {
s=ures_getStringByKey(rb.getAlias(), tableKey, &length, &errorCode);
/* see comment about closing rb near "return item;" in _res_getTableStringWithFallback() */
}
} else {
// Compares the first 9 characters, i.e. the length of "Languages".
bool isLanguageCode = (uprv_strncmp(tableKey, _kLanguages, 9) == 0);
/* Language code should not be a number. If it is, set the error code. */
// Only a key that parses to a non-zero number is rejected here.
if (isLanguageCode && uprv_strtol(itemKey, nullptr, 10)) {
errorCode = U_MISSING_RESOURCE_ERROR;
} else {
/* second-level item, use special fallback */
s=uloc_getTableStringWithFallback(path, locale,
tableKey,
subTableKey,
itemKey,
&length,
&errorCode);
if (U_FAILURE(errorCode) && isLanguageCode && itemKey != nullptr) {
// convert itemKey locale code to canonical form and try again, ICU-20870
errorCode = U_ZERO_ERROR;
Locale canonKey = Locale::createCanonical(itemKey);
s=uloc_getTableStringWithFallback(path, locale,
tableKey,
subTableKey,
canonKey.getName(),
&length,
&errorCode);
}
}
}
if(U_SUCCESS(errorCode)) {
// Copy as much of the resource string as fits; length keeps the full length.
int32_t copyLength=uprv_min(length, destCapacity);
if(copyLength>0 && s != nullptr) {
u_memcpy(dest, s, copyLength);
}
} else {
/* no string from a resource bundle: convert the substitute */
length=(int32_t)uprv_strlen(substitute);
u_charsToUChars(substitute, dest, uprv_min(length, destCapacity));
// Replaces the lookup failure with a warning so the substitute counts as a result.
errorCode = U_USING_DEFAULT_WARNING;
}
// Terminates dest if there is room and reports overflow/non-termination via errorCode.
return u_terminateUChars(dest, destCapacity, length, &errorCode);
}
// Signature of the functions that extract one subtag (language, script,
// region, variant) from a locale ID; see the ulocimp_get* functions passed
// to _getDisplayNameForComponent().
using UDisplayNameGetter = icu::CharString(const char*, UErrorCode&);
/**
 * Gets the display name of one component of `locale`, localized for
 * `displayLocale`.
 *
 * @param locale        locale ID from which the component is extracted
 * @param displayLocale locale in which the name is to be displayed
 * @param dest          output buffer (may be nullptr only if destCapacity is 0)
 * @param destCapacity  capacity of dest; negative values are rejected
 * @param getter        function extracting the component from the locale ID
 * @param tag           resource table key for this kind of component
 * @param errorCode     input-output error code
 * @return the length of the display name
 */
int32_t
_getDisplayNameForComponent(const char *locale,
                            const char *displayLocale,
                            char16_t *dest, int32_t destCapacity,
                            UDisplayNameGetter *getter,
                            const char *tag,
                            UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return 0; }

    const bool badBuffer = destCapacity < 0 || (destCapacity > 0 && dest == nullptr);
    if (badBuffer) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Extract the component with its own status so that a getter failure
    // is reported uniformly as an illegal argument.
    UErrorCode getterStatus = U_ZERO_ERROR;
    icu::CharString component = (*getter)(locale, getterStatus);
    if (U_FAILURE(getterStatus)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if (component.isEmpty()) {
        if (getter != ulocimp_getLanguage) {
            // No such component in the locale: the display name is empty.
            return u_terminateUChars(dest, destCapacity, 0, &errorCode);
        }
        // For the display name, we treat this as unknown language (ICU-20273).
        component.append("und", errorCode);
    }

    // Region names live in their own data bundle; everything else is in the language bundle.
    const char *bundlePath = (tag == _kCountries) ? U_ICUDATA_REGION : U_ICUDATA_LANG;

    // The component itself doubles as the substitute if no resource string is found.
    return _getStringOrCopyKey(bundlePath, displayLocale,
                               tag, nullptr, component.data(),
                               component.data(),
                               dest, destCapacity,
                               errorCode);
}
} // namespace
// Display name of the language subtag of `locale`, looked up in the "Languages" table.
U_CAPI int32_t U_EXPORT2
uloc_getDisplayLanguage(const char *locale,
                        const char *displayLocale,
                        char16_t *dest, int32_t destCapacity,
                        UErrorCode *pErrorCode) {
    const int32_t nameLength =
        _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                    ulocimp_getLanguage, _kLanguages, *pErrorCode);
    return nameLength;
}
/**
 * Display name of the script subtag of `locale`. The stand-alone script
 * table is consulted first; if it yields only the default substitute, the
 * regular "Scripts" table is used instead.
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayScript(const char* locale,
                      const char* displayLocale,
                      char16_t *dest, int32_t destCapacity,
                      UErrorCode *pErrorCode)
{
    if (U_FAILURE(*pErrorCode)) { return 0; }

    // Stand-alone lookup with a private status so the caller's code is only
    // touched once the outcome is known.
    UErrorCode standAloneStatus = U_ZERO_ERROR;
    const int32_t standAloneLength =
        _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                    ulocimp_getScript, _kScriptsStandAlone, standAloneStatus);

    const bool preflightOverflow =
        destCapacity == 0 && standAloneStatus == U_BUFFER_OVERFLOW_ERROR;
    if (preflightOverflow) {
        // For preflight, return the max of the value and the fallback.
        const int32_t fallbackLength =
            _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                        ulocimp_getScript, _kScripts, *pErrorCode);
        return (standAloneLength >= fallbackLength) ? standAloneLength : fallbackLength;
    }

    if (standAloneStatus != U_USING_DEFAULT_WARNING) {
        *pErrorCode = standAloneStatus;
        return standAloneLength;
    }

    // No stand-alone name was found: fall back to the regular script table.
    return _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                       ulocimp_getScript, _kScripts, *pErrorCode);
}
// Display name of the script subtag taken directly from the "Scripts" table,
// without trying the stand-alone table first.
static int32_t
uloc_getDisplayScriptInContext(const char* locale,
                               const char* displayLocale,
                               char16_t *dest, int32_t destCapacity,
                               UErrorCode *pErrorCode)
{
    const int32_t nameLength =
        _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                    ulocimp_getScript, _kScripts, *pErrorCode);
    return nameLength;
}
// Display name of the region subtag of `locale`, looked up in the "Countries" table.
U_CAPI int32_t U_EXPORT2
uloc_getDisplayCountry(const char *locale,
                       const char *displayLocale,
                       char16_t *dest, int32_t destCapacity,
                       UErrorCode *pErrorCode) {
    const int32_t nameLength =
        _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                    ulocimp_getRegion, _kCountries, *pErrorCode);
    return nameLength;
}
/*
* TODO separate variant1_variant2_variant3...
* by getting each tag's display string and concatenating them with ", "
* in between - similar to uloc_getDisplayName()
*/
// Display name of the variant subtag of `locale`, looked up in the "Variants" table.
U_CAPI int32_t U_EXPORT2
uloc_getDisplayVariant(const char *locale,
                       const char *displayLocale,
                       char16_t *dest, int32_t destCapacity,
                       UErrorCode *pErrorCode) {
    const int32_t nameLength =
        _getDisplayNameForComponent(locale, displayLocale, dest, destCapacity,
                                    ulocimp_getVariant, _kVariants, *pErrorCode);
    return nameLength;
}
/* Instead of having a separate pass for 'special' patterns, reintegrate the two
* so we don't get bitten by preflight bugs again. We can be reasonably efficient
* without two separate code paths, this code isn't that performance-critical.
*
* This code is general enough to deal with patterns that have a prefix or swap the
* language and remainder components, since we gave developers enough rope to do such
* things if they futz with the pattern data. But since we don't give them a way to
* specify a pattern for arbitrary combinations of components, there's not much use in
* that. I don't think our data includes such patterns; the only variable I know of is
* whether there is a space before the open paren, or not. Oh, and zh uses different
* chars than the standard open/close paren (which ja and ko use, btw).
*/
U_CAPI int32_t U_EXPORT2
uloc_getDisplayName(const char *locale,
const char *displayLocale,
char16_t *dest, int32_t destCapacity,
UErrorCode *pErrorCode)
{
static const char16_t defaultSeparator[9] = { 0x007b, 0x0030, 0x007d, 0x002c, 0x0020, 0x007b, 0x0031, 0x007d, 0x0000 }; /* "{0}, {1}" */
static const char16_t sub0[4] = { 0x007b, 0x0030, 0x007d , 0x0000 } ; /* {0} */
static const char16_t sub1[4] = { 0x007b, 0x0031, 0x007d , 0x0000 } ; /* {1} */
static const int32_t subLen = 3;
static const char16_t defaultPattern[10] = {
0x007b, 0x0030, 0x007d, 0x0020, 0x0028, 0x007b, 0x0031, 0x007d, 0x0029, 0x0000
}; /* {0} ({1}) */
static const int32_t defaultPatLen = 9;
static const int32_t defaultSub0Pos = 0;
static const int32_t defaultSub1Pos = 5;
int32_t length; /* of formatted result */
const char16_t *separator;
int32_t sepLen = 0;
const char16_t *pattern;
int32_t patLen = 0;
int32_t sub0Pos, sub1Pos;
char16_t formatOpenParen = 0x0028; // (
char16_t formatReplaceOpenParen = 0x005B; // [
char16_t formatCloseParen = 0x0029; // )
char16_t formatReplaceCloseParen = 0x005D; // ]
UBool haveLang = true; /* assume true, set false if we find we don't have
a lang component in the locale */
UBool haveRest = true; /* assume true, set false if we find we don't have
any other component in the locale */
UBool retry = false; /* set true if we need to retry, see below */
int32_t langi = 0; /* index of the language substitution (0 or 1), virtually always 0 */
if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
return 0;
}
if(destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
{
UErrorCode status = U_ZERO_ERROR;
icu::LocalUResourceBundlePointer locbundle(
ures_open(U_ICUDATA_LANG, displayLocale, &status));
icu::LocalUResourceBundlePointer dspbundle(
ures_getByKeyWithFallback(locbundle.getAlias(), _kLocaleDisplayPattern, nullptr, &status));
separator=ures_getStringByKeyWithFallback(dspbundle.getAlias(), _kSeparator, &sepLen, &status);
pattern=ures_getStringByKeyWithFallback(dspbundle.getAlias(), _kPattern, &patLen, &status);
}
/* If we couldn't find any data, then use the defaults */
if(sepLen == 0) {
separator = defaultSeparator;
}
/* #10244: Even though separator is now a pattern, it is awkward to handle it as such
* here since we are trying to build the display string in place in the dest buffer,
* and to handle it as a pattern would entail having separate storage for the
* substrings that need to be combined (the first of which may be the result of
* previous such combinations). So for now we continue to treat the portion between
* {0} and {1} as a string to be appended when joining substrings, ignoring anything
* that is before {0} or after {1} (no existing separator pattern has any such thing).
* This is similar to how pattern is handled below.
*/
{
char16_t *p0=u_strstr(separator, sub0);
char16_t *p1=u_strstr(separator, sub1);
if (p0==nullptr || p1==nullptr || p1<p0) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
separator = (const char16_t *)p0 + subLen;
sepLen = static_cast<int32_t>(p1 - separator);
}
if(patLen==0 || (patLen==defaultPatLen && !u_strncmp(pattern, defaultPattern, patLen))) {
pattern=defaultPattern;
patLen=defaultPatLen;
sub0Pos=defaultSub0Pos;
sub1Pos=defaultSub1Pos;
// use default formatOpenParen etc. set above
} else { /* non-default pattern */
char16_t *p0=u_strstr(pattern, sub0);
char16_t *p1=u_strstr(pattern, sub1);
if (p0==nullptr || p1==nullptr) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
sub0Pos = static_cast<int32_t>(p0-pattern);
sub1Pos = static_cast<int32_t>(p1-pattern);
if (sub1Pos < sub0Pos) { /* a very odd pattern */
int32_t t=sub0Pos; sub0Pos=sub1Pos; sub1Pos=t;
langi=1;
}
if (u_strchr(pattern, 0xFF08) != nullptr) {
formatOpenParen = 0xFF08; // fullwidth (
formatReplaceOpenParen = 0xFF3B; // fullwidth [
formatCloseParen = 0xFF09; // fullwidth )
formatReplaceCloseParen = 0xFF3D; // fullwidth ]
}
}
/* We loop here because there is one case in which after the first pass we could need to
* reextract the data. If there's initial padding before the first element, we put in
* the padding and then write that element. If it turns out there's no second element,
* we didn't need the padding. If we do need the data (no preflight), and the first element
* would have fit but for the padding, we need to reextract. In this case (only) we
* adjust the parameters so padding is not added, and repeat.
*/
do {
char16_t* p=dest;
int32_t patPos=0; /* position in the pattern, used for non-substitution portions */
int32_t langLen=0; /* length of language substitution */
int32_t langPos=0; /* position in output of language substitution */
int32_t restLen=0; /* length of 'everything else' substitution */
int32_t restPos=0; /* position in output of 'everything else' substitution */
icu::LocalUEnumerationPointer kenum; /* keyword enumeration */
/* prefix of pattern, extremely likely to be empty */
if(sub0Pos) {
if(destCapacity >= sub0Pos) {
while (patPos < sub0Pos) {
*p++ = pattern[patPos++];
}
} else {
patPos=sub0Pos;
}
length=sub0Pos;
} else {
length=0;
}
for(int32_t subi=0,resti=0;subi<2;) { /* iterate through patterns 0 and 1*/
UBool subdone = false; /* set true when ready to move to next substitution */
/* prep p and cap for calls to get display components, pin cap to 0 since
they complain if cap is negative */
int32_t cap=destCapacity-length;
if (cap <= 0) {
cap=0;
} else {
p=dest+length;
}
if (subi == langi) { /* {0}*/
if(haveLang) {
langPos=length;
langLen=uloc_getDisplayLanguage(locale, displayLocale, p, cap, pErrorCode);
length+=langLen;
haveLang=langLen>0;
}
subdone=true;
} else { /* {1} */
if(!haveRest) {
subdone=true;
} else {
int32_t len; /* length of component (plus other stuff) we just fetched */
switch(resti++) {
case 0:
restPos=length;
len=uloc_getDisplayScriptInContext(locale, displayLocale, p, cap, pErrorCode);
break;
case 1:
len=uloc_getDisplayCountry(locale, displayLocale, p, cap, pErrorCode);
break;
case 2:
len=uloc_getDisplayVariant(locale, displayLocale, p, cap, pErrorCode);
break;
case 3:
kenum.adoptInstead(uloc_openKeywords(locale, pErrorCode));
U_FALLTHROUGH;
default: {
const char* kw=uenum_next(kenum.getAlias(), &len, pErrorCode);
if (kw == nullptr) {
len=0; /* mark that we didn't add a component */
subdone=true;
} else {
/* incorporating this behavior into the loop made it even more complex,
so just special case it here */
len = uloc_getDisplayKeyword(kw, displayLocale, p, cap, pErrorCode);
if(len) {
if(len < cap) {
p[len]=0x3d; /* '=', assume we'll need it */
}
len+=1;
/* adjust for call to get keyword */
cap-=len;
if(cap <= 0) {
cap=0;
} else {
p+=len;
}
}
/* reset for call below */
if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
*pErrorCode=U_ZERO_ERROR;
}
int32_t vlen = uloc_getDisplayKeywordValue(locale, kw, displayLocale,
p, cap, pErrorCode);
if(len) {
if(vlen==0) {
--len; /* remove unneeded '=' */
}
/* restore cap and p to what they were at start */
cap=destCapacity-length;
if(cap <= 0) {
cap=0;
} else {
p=dest+length;
}
}
len+=vlen; /* total we added for key + '=' + value */
}
} break;
} /* end switch */
if (len>0) {
/* we added a component, so add separator and write it if there's room. */
if(len+sepLen<=cap) {
const char16_t * plimit = p + len;
for (; p < plimit; p++) {
if (*p == formatOpenParen) {
*p = formatReplaceOpenParen;
} else if (*p == formatCloseParen) {
*p = formatReplaceCloseParen;
}
}
for(int32_t i=0;i<sepLen;++i) {
*p++=separator[i];
}
}
length+=len+sepLen;
} else if(subdone) {
/* remove separator if we added it */
if (length!=restPos) {
length-=sepLen;
}
restLen=length-restPos;
haveRest=restLen>0;
}
}
}
if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
*pErrorCode=U_ZERO_ERROR;
}
if(subdone) {
if(haveLang && haveRest) {
/* append internal portion of pattern, the first time,
or last portion of pattern the second time */
int32_t padLen;
patPos+=subLen;
padLen=(subi==0 ? sub1Pos : patLen)-patPos;
if(length+padLen <= destCapacity) {
p=dest+length;
for(int32_t i=0;i<padLen;++i) {
*p++=pattern[patPos++];
}
} else {
patPos+=padLen;
}
length+=padLen;
} else if(subi==0) {
/* don't have first component, reset for second component */
sub0Pos=0;
length=0;
} else if(length>0) {
/* true length is the length of just the component we got. */
length=haveLang?langLen:restLen;
if(dest && sub0Pos!=0) {
if (sub0Pos+length<=destCapacity) {
/* first component not at start of result,
but we have full component in buffer. */
u_memmove(dest, dest+(haveLang?langPos:restPos), length);
} else {
/* would have fit, but didn't because of pattern prefix. */
sub0Pos=0; /* stops initial padding (and a second retry,
so we won't end up here again) */
retry=true;
}
}
}
++subi; /* move on to next substitution */
}
}
} while(retry);
return u_terminateUChars(dest, destCapacity, length, pErrorCode);
}
/**
 * Writes the display name of a locale keyword (not its value) into dest.
 *
 * The lookup is delegated to _getStringOrCopyKey() against the language data
 * bundle, with the keyword itself passed as the substitute string.
 *
 * @param keyword       the keyword whose display name is requested
 * @param displayLocale locale in which the name should be expressed
 * @param dest          output buffer; may be nullptr only if destCapacity is 0
 * @param destCapacity  capacity of dest in char16_t units; must not be negative
 * @param status        in/out error code; nothing is done if it is nullptr or
 *                      already indicates a failure
 * @return the value returned by _getStringOrCopyKey(), or 0 on argument errors
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayKeyword(const char* keyword,
                       const char* displayLocale,
                       char16_t* dest,
                       int32_t destCapacity,
                       UErrorCode* status){
    /* bail out early on an unusable error code */
    if (status == nullptr || U_FAILURE(*status)) {
        return 0;
    }

    /* a buffer is mandatory as soon as a positive capacity is announced */
    const bool missingBuffer = destCapacity > 0 && dest == nullptr;
    if (destCapacity < 0 || missingBuffer) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* itemKey == nullptr selects a top-level item of the keys table */
    const char* const itemKey = nullptr;
    return _getStringOrCopyKey(U_ICUDATA_LANG, displayLocale,
                               _kKeys, itemKey,
                               keyword,
                               keyword,
                               dest, destCapacity,
                               *status);
}
#define UCURRENCY_DISPLAY_NAME_INDEX 1
/**
 * Writes the display name of the value that `keyword` has in `locale`.
 *
 * For every keyword except the currency keyword the lookup is delegated to
 * _getStringOrCopyKey(). Currency values are resolved here against the currency
 * data bundle; if no display name is available the raw keyword value is copied.
 *
 * @param locale        locale ID carrying the keyword
 * @param keyword       keyword whose value should be displayed
 * @param displayLocale locale in which the name should be expressed
 * @param dest          output buffer; may be nullptr only if destCapacity is 0
 * @param destCapacity  capacity of dest in char16_t units; must not be negative
 * @param status        in/out error code; set to U_BUFFER_OVERFLOW_ERROR when the
 *                      currency result does not fit into dest
 * @return length of the display string (the full length on overflow), or 0 on error
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayKeywordValue(   const char* locale,
                               const char* keyword,
                               const char* displayLocale,
                               char16_t* dest,
                               int32_t destCapacity,
                               UErrorCode* status){
    /* validate the error code and the output buffer description */
    if (status == nullptr || U_FAILURE(*status)) {
        return 0;
    }
    if (destCapacity < 0 || (dest == nullptr && destCapacity > 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* extract the raw value of the keyword from the locale ID */
    CharString keywordValue = ulocimp_getKeywordValue(locale, keyword, *status);

    /* everything but "currency" goes through the generic types table */
    if (uprv_stricmp(keyword, _kCurrency) != 0) {
        return _getStringOrCopyKey(U_ICUDATA_LANG, displayLocale,
                                   _kTypes, keyword,
                                   keywordValue.data(),
                                   keywordValue.data(),
                                   dest, destCapacity,
                                   *status);
    }

    /*
     * Currency: the display name lives in the currency bundle, so the
     * fallback has to be performed by hand.
     */
    int32_t nameLength = 0;
    icu::LocalUResourceBundlePointer bundle(
            ures_open(U_ICUDATA_CURR, displayLocale, status));
    icu::LocalUResourceBundlePointer currencies(
            ures_getByKey(bundle.getAlias(), _kCurrencies, nullptr, status));
    icu::LocalUResourceBundlePointer currency(
            ures_getByKeyWithFallback(currencies.getAlias(), keywordValue.data(), nullptr, status));
    const char16_t *name = ures_getStringByIndex(
            currency.getAlias(), UCURRENCY_DISPLAY_NAME_INDEX, &nameLength, status);

    if (U_FAILURE(*status)) {
        if (*status != U_MISSING_RESOURCE_ERROR) {
            return 0;
        }
        /* no data available: downgrade to a warning and copy the raw value below */
        *status = U_USING_DEFAULT_WARNING;
    }

    if (name == nullptr) {
        /* no display name was found for the value, so emit the value itself */
        const int32_t valueLength = keywordValue.length();
        if (valueLength > destCapacity) {
            *status = U_BUFFER_OVERFLOW_ERROR;
            return valueLength;
        }
        u_charsToUChars(keywordValue.data(), dest, valueLength);
        return u_terminateUChars(dest, destCapacity, valueLength, status);
    }

    /* a display name exists: copy it if it fits */
    if (nameLength > destCapacity) {
        *status = U_BUFFER_OVERFLOW_ERROR;
        return nameLength;
    }
    u_memcpy(dest, name, nameLength);
    return u_terminateUChars(dest, destCapacity, nameLength, status);
}

View file

@ -0,0 +1,415 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// locdistance.cpp
// created: 2019may08 Markus W. Scherer
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/localematcher.h"
#include "unicode/locid.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "cstring.h"
#include "locdistance.h"
#include "loclikelysubtags.h"
#include "uassert.h"
#include "ucln_cmn.h"
#include "uinvchar.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
namespace {

/**
 * Flag OR-ed onto the last character of each subtag when it is fed to the trie.
 * The data builder and the lookup code must use the same value.
 */
constexpr int32_t END_OF_SUBTAG = 0x80;

/** Flag bit inside a distance value; set by the data builder. */
constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
/** Flag bit inside a distance value; set by trieNext(). */
constexpr int32_t DISTANCE_IS_FINAL = 0x100;
/** Both distance flag bits combined, for masking them in or out at once. */
constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT =
        DISTANCE_SKIP_SCRIPT | DISTANCE_IS_FINAL;

/** Distance reported when no supported locale stays below the threshold. */
constexpr int32_t ABOVE_THRESHOLD = 100;

/** Positions of the individual values in the array of default distances. */
enum {
    IX_DEF_LANG_DISTANCE = 0,
    IX_DEF_SCRIPT_DISTANCE = 1,
    IX_DEF_REGION_DISTANCE = 2,
    IX_MIN_REGION_DISTANCE = 3,
    IX_LIMIT = 4
};

/** The lazily created singleton and the once-flag guarding its creation. */
LocaleDistance *gLocaleDistance = nullptr;
UInitOnce gInitOnce {};

/** Library cleanup hook: destroys the singleton and re-arms the once-flag. */
UBool U_CALLCONV cleanup() {
    delete gLocaleDistance;
    gLocaleDistance = nullptr;
    gInitOnce.reset();
    return true;
}

}  // namespace
/**
 * One-time initializer for the LocaleDistance singleton.
 *
 * Obtains the LikelySubtags singleton, validates that its distance data is
 * complete, allocates the LocaleDistance instance and registers the cleanup hook.
 *
 * @param errorCode in/out error code. Set to U_MISSING_RESOURCE_ERROR when required
 *                  distance data is absent and to U_MEMORY_ALLOCATION_ERROR when the
 *                  instance cannot be allocated; failures from
 *                  LikelySubtags::getSingleton() are passed through unchanged.
 */
void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
    // This function is invoked only via umtx_initOnce().
    U_ASSERT(gLocaleDistance == nullptr);
    // Check the error code BEFORE dereferencing: on failure the returned pointer
    // must not be turned into a reference.
    const LikelySubtags *likelyPtr = LikelySubtags::getSingleton(errorCode);
    if (U_FAILURE(errorCode)) { return; }
    const LikelySubtags &likely = *likelyPtr;
    const LocaleDistanceData &data = likely.getDistanceData();
    if (data.distanceTrieBytes == nullptr ||
            data.regionToPartitions == nullptr || data.partitions == nullptr ||
            // ok if no paradigms
            data.distances == nullptr) {
        errorCode = U_MISSING_RESOURCE_ERROR;
        return;
    }
    gLocaleDistance = new LocaleDistance(data, likely);
    if (gLocaleDistance == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
}
/**
 * Returns the shared LocaleDistance instance, creating it on first use.
 *
 * @param errorCode in/out error code; nothing is attempted if it already
 *                  indicates a failure
 * @return the singleton, or nullptr if errorCode indicated a failure on entry
 */
const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
        return gLocaleDistance;
    }
    return nullptr;
}
/**
 * Wires the instance to the offline-built distance data and derives the
 * default demotion value from it.
 *
 * @param data   distance data (trie bytes, partitions, paradigms, default distances)
 * @param likely the LikelySubtags instance kept by reference for tie-breaking
 */
LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely) :
        likelySubtags(likely),
        trie(data.distanceTrieBytes),
        regionToPartitionsIndex(data.regionToPartitions),
        partitionArrays(data.partitions),
        paradigmLSRs(data.paradigms),
        paradigmLSRsLength(data.paradigmsLength),
        defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
        defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
        defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
        minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
    // The default demotion per desired locale is the region distance between
    // two unrelated Englishes (en-US vs. en-GB). Unless demotion is switched off,
    // a pure region mismatch for one desired locale is therefore worth as much as
    // a perfect match for the next desired locale in the list.
    // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
    LSR american("en", "Latn", "US", LSR::EXPLICIT_LSR);
    LSR british("en", "Latn", "GB", LSR::EXPLICIT_LSR);
    const LSR *supported[] = { &british };
    defaultDemotionPerDesiredLocale = getDistanceFloor(
            getBestIndexAndDistance(american, supported, 1,
                                    shiftDistance(50),
                                    ULOCMATCH_FAVOR_LANGUAGE,
                                    ULOCMATCH_DIRECTION_WITH_ONE_WAY));
}
/**
 * Finds the supported LSR that is closest to the desired LSR.
 *
 * The desired language is looked up in the trie once; for every supported LSR the
 * language, script and region distances are then accumulated, and the candidate is
 * dropped as soon as the running sum exceeds the rounded threshold.
 * Whenever a better candidate is found, shiftedThreshold is lowered to its distance,
 * so later candidates have to beat it. Exact ties are resolved with
 * LikelySubtags::compareLikely().
 *
 * @param desired             the desired LSR
 * @param supportedLSRs       array of pointers to the supported LSRs
 * @param supportedLSRsLength number of entries in supportedLSRs
 * @param shiftedThreshold    upper bound for acceptable distances, already shifted
 *                            left by DISTANCE_SHIFT
 * @param favorSubtag         ULOCMATCH_FAVOR_SCRIPT quarters the language distance
 * @param direction           with ULOCMATCH_DIRECTION_ONLY_TWO_WAY a candidate is
 *                            only accepted if it also matches with desired and
 *                            supported swapped
 * @return (bestIndex << INDEX_SHIFT) | shifted distance of the best candidate, or
 *         INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD) if no candidate qualified.
 *         A candidate with shifted distance 0 is returned immediately.
 */
int32_t LocaleDistance::getBestIndexAndDistance(
const LSR &desired,
const LSR **supportedLSRs, int32_t supportedLSRsLength,
int32_t shiftedThreshold,
ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
BytesTrie iter(trie);
// Look up the desired language only once for all supported LSRs.
// Its "distance" is either a match point value of 0, or a non-match negative value.
// Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
int32_t desLangDistance = trieNext(iter, desired.language, false);
// The trie state after the desired language is only needed for rewinding
// when there is more than one supported LSR to test.
uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
// Index of the supported LSR with the lowest distance.
int32_t bestIndex = -1;
// Cached lookup info from LikelySubtags.compareLikely().
int32_t bestLikelyInfo = -1;
for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
const LSR &supported = *supportedLSRs[slIndex];
bool star = false;
int32_t distance = desLangDistance;
if (distance >= 0) {
U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
if (slIndex != 0) {
iter.resetToState64(desLangState);
}
distance = trieNext(iter, supported.language, true);
}
// Note: The data builder verifies that there are no rules with "any" (*) language and
// real (non *) script or region subtags.
// This means that if the lookup for either language fails we can use
// the default distances without further lookups.
int32_t flags;
if (distance >= 0) {
// Split the trie value into its flag bits and the plain distance.
flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
} else { // <*, *>
if (uprv_strcmp(desired.language, supported.language) == 0) {
distance = 0;
} else {
distance = defaultLanguageDistance;
}
flags = 0;
star = true;
}
U_ASSERT(0 <= distance && distance <= 100);
// Round up the shifted threshold (if fraction bits are not 0)
// for comparison with un-shifted distances until we need fraction bits.
// (If we simply shifted non-zero fraction bits away, then we might ignore a language
// when it's really still a micro distance below the threshold.)
// Recomputed per candidate because shiftedThreshold shrinks when a match is found.
int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
// We implement "favor subtag" by reducing the language subtag distance
// (unscientifically reducing it to a quarter of the normal value),
// so that the script distance is relatively more important.
// For example, given a default language distance of 80, we reduce it to 20,
// which is below the default threshold of 50, which is the default script distance.
if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
distance >>= 2;
}
// Let distance == roundedThreshold pass until the tie-breaker logic
// at the end of the loop.
if (distance > roundedThreshold) {
continue;
}
int32_t scriptDistance;
if (star || flags != 0) {
// No script-level rules can apply: use 0 for equal scripts, else the default.
if (uprv_strcmp(desired.script, supported.script) == 0) {
scriptDistance = 0;
} else {
scriptDistance = defaultScriptDistance;
}
} else {
scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
desired.script, supported.script);
flags = scriptDistance & DISTANCE_IS_FINAL;
scriptDistance &= ~DISTANCE_IS_FINAL;
}
distance += scriptDistance;
if (distance > roundedThreshold) {
continue;
}
if (uprv_strcmp(desired.region, supported.region) == 0) {
// regionDistance = 0
} else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
distance += defaultRegionDistance;
} else {
int32_t remainingThreshold = roundedThreshold - distance;
// Even the smallest possible region distance would exceed the threshold.
if (minRegionDistance > remainingThreshold) {
continue;
}
// From here on we know the regions are not equal.
// Map each region to zero or more partitions. (zero = one non-matching string)
// (Each array of single-character partition strings is encoded as one string.)
// If either side has more than one, then we find the maximum distance.
// This could be optimized by adding some more structure, but probably not worth it.
distance += getRegionPartitionsDistance(
iter, iter.getState64(),
partitionsForRegion(desired),
partitionsForRegion(supported),
remainingThreshold);
}
int32_t shiftedDistance = shiftDistance(distance);
if (shiftedDistance == 0) {
// Distinguish between equivalent but originally unequal locales via an
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
if (shiftedDistance < shiftedThreshold) {
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
// A truly perfect match cannot be improved upon: stop searching.
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
}
} else {
if (shiftedDistance < shiftedThreshold) {
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
bestLikelyInfo = likelySubtags.compareLikely(
supported, *supportedLSRs[bestIndex], bestLikelyInfo);
if ((bestLikelyInfo & 1) != 0) {
// This supported locale matches as well as the previous best match,
// and neither matches perfectly,
// but this one is "more likely" (has more-default subtags).
bestIndex = slIndex;
}
}
}
}
}
return bestIndex >= 0 ?
(bestIndex << INDEX_SHIFT) | shiftedThreshold :
INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
}
/**
 * Looks up the distance between a desired and a supported script.
 *
 * An explicit (desired, supported) rule is tried first. If there is none, the
 * iterator is rewound to startState and the <*, *> entry is consulted instead:
 * identical scripts then yield 0, different scripts yield the <*, *> value.
 *
 * @param iter       trie iterator positioned after the language pair
 * @param startState saved state of iter, used to rewind for the fallback lookup
 * @param desired    desired script subtag
 * @param supported  supported script subtag
 * @return the distance, with DISTANCE_IS_FINAL set if the trie value was final
 */
int32_t LocaleDistance::getDesSuppScriptDistance(
        BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
    // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
    if (trieNext(iter, desired, false) >= 0) {
        const int32_t explicitDistance = trieNext(iter, supported, true);
        if (explicitDistance >= 0) {
            return explicitDistance;
        }
    }

    // No explicit rule: fall back to <*, *>.
    const UStringTrieResult starResult = iter.resetToState64(startState).next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_VALUE(starResult));

    int32_t fallbackDistance = 0;  // stays 0 for the same script
    if (uprv_strcmp(desired, supported) != 0) {
        fallbackDistance = iter.getValue();
        U_ASSERT(fallbackDistance >= 0);
    }
    return starResult == USTRINGTRIE_FINAL_VALUE ?
            (fallbackDistance | DISTANCE_IS_FINAL) : fallbackDistance;
}
/**
 * Computes the region distance between two sets of region partitions.
 *
 * Both partition arguments are NUL-terminated strings in which every character is
 * one partition. The result is the maximum distance over all
 * (desired, supported) partition pairs; the search stops early and returns the
 * first pair distance that exceeds the threshold.
 * Pairs without an explicit trie entry use the <*, *> fallback distance, which is
 * looked up at most once per call in the multi-partition path.
 *
 * @param iter                trie iterator positioned after the script pair
 * @param startState          saved state of iter, used for rewinding
 * @param desiredPartitions   partitions of the desired region (must not be empty)
 * @param supportedPartitions partitions of the supported region (must not be empty)
 * @param threshold           remaining distance budget; exceeding it ends the search
 * @return the maximum pair distance, or the first pair distance above threshold
 */
int32_t LocaleDistance::getRegionPartitionsDistance(
BytesTrie &iter, uint64_t startState,
const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
char desired = *desiredPartitions++;
char supported = *supportedPartitions++;
U_ASSERT(desired != 0 && supported != 0);
// See if we have single desired/supported partitions, from NUL-terminated
// partition strings without explicit length.
bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character
// equivalent to: if (desLength == 1 && suppLength == 1)
if (*desiredPartitions == 0 && !suppLengthGt1) {
// Fastpath for single desired/supported partitions.
UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
if (USTRINGTRIE_HAS_NEXT(result)) {
result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
if (USTRINGTRIE_HAS_VALUE(result)) {
return iter.getValue();
}
}
return getFallbackRegionDistance(iter, startState);
}
const char *supportedStart = supportedPartitions - 1; // for restart of inner loop
int32_t regionDistance = 0;
// Fall back to * only once, not for each pair of partition strings.
bool star = false;
for (;;) {
// Look up each desired-partition string only once,
// not for each (desired, supported) pair.
UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
if (USTRINGTRIE_HAS_NEXT(result)) {
// The state after the desired partition is only needed for rewinding
// when there are several supported partitions.
uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
for (;;) {
result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
int32_t d;
if (USTRINGTRIE_HAS_VALUE(result)) {
d = iter.getValue();
} else if (star) {
// The fallback distance was already folded into regionDistance earlier.
d = 0;
} else {
d = getFallbackRegionDistance(iter, startState);
star = true;
}
if (d > threshold) {
return d;
} else if (regionDistance < d) {
regionDistance = d;
}
if ((supported = *supportedPartitions++) != 0) {
iter.resetToState64(desState);
} else {
break;
}
}
} else if (!star) {
// No rules at all for this desired partition: use the fallback once.
int32_t d = getFallbackRegionDistance(iter, startState);
if (d > threshold) {
return d;
} else if (regionDistance < d) {
regionDistance = d;
}
star = true;
}
if ((desired = *desiredPartitions++) != 0) {
// Rewind the trie and restart the supported partitions for the next desired one.
iter.resetToState64(startState);
supportedPartitions = supportedStart;
supported = *supportedPartitions++;
} else {
break;
}
}
return regionDistance;
}
/**
 * Reads the <*, *> region distance.
 *
 * @param iter       trie iterator; it is rewound to startState before the lookup
 * @param startState saved iterator state from which the '*' entry is reachable
 * @return the value stored for the '*' entry
 */
int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
    const UStringTrieResult starResult = iter.resetToState64(startState).next(u'*');  // <*, *>
    (void)starResult;  // only inspected by the assertion below
    U_ASSERT(USTRINGTRIE_HAS_VALUE(starResult));
    const int32_t fallbackDistance = iter.getValue();
    U_ASSERT(fallbackDistance >= 0);
    return fallbackDistance;
}
/**
 * Advances the trie iterator over one subtag.
 *
 * All characters but the last are fed to the trie as they are; the last one is
 * fed with END_OF_SUBTAG set.
 *
 * @param iter      trie iterator to advance
 * @param s         NUL-terminated subtag
 * @param wantValue if true, the trie value after the subtag is returned (with
 *                  DISTANCE_IS_FINAL added when the value is final); if false,
 *                  0 is returned when the trie can continue after the subtag
 * @return the value or 0 as described above, or -1 if the subtag is empty or is
 *         not matched in the required way
 */
int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
    uint8_t current = *s;
    if (current == 0) {
        return -1;  // no empty subtags in the distance data
    }

    // Consume every character that still has a successor.
    // EBCDIC: If a character is not an invariant character, then its converted
    // form is 0 and will simply not match anything, which is harmless.
    uint8_t following = *++s;
    while (following != 0) {
        const uint8_t ascii = uprv_invCharToAscii(current);
        if (!USTRINGTRIE_HAS_NEXT(iter.next(ascii))) {
            return -1;
        }
        current = following;
        following = *++s;
    }

    // Last character of this subtag.
    const uint8_t lastAscii = uprv_invCharToAscii(current);
    const UStringTrieResult result = iter.next(lastAscii | END_OF_SUBTAG);
    if (!wantValue) {
        return USTRINGTRIE_HAS_NEXT(result) ? 0 : -1;
    }
    if (!USTRINGTRIE_HAS_VALUE(result)) {
        return -1;
    }
    int32_t value = iter.getValue();
    if (result == USTRINGTRIE_FINAL_VALUE) {
        value |= DISTANCE_IS_FINAL;
    }
    return value;
}
bool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
// Linear search for a very short list (length 6 as of 2019),
// because we look for equivalence not equality, and
// because it's easy.
// If there are many paradigm LSRs we should use a hash set
// with custom comparator and hasher.
U_ASSERT(paradigmLSRsLength <= 15);
for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
}
return false;
}
U_NAMESPACE_END

View file

@ -0,0 +1,151 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// locdistance.h
// created: 2019may08 Markus W. Scherer
#ifndef __LOCDISTANCE_H__
#define __LOCDISTANCE_H__
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/localematcher.h"
#include "unicode/locid.h"
#include "unicode/uobject.h"
#include "lsr.h"
U_NAMESPACE_BEGIN
struct LocaleDistanceData;
/**
 * Locale distance computation for LocaleMatcher, based on offline-built data.
 *
 * An instance wraps a BytesTrie of (desired, supported) subtag distances plus
 * region partition tables, and finds the supported LSR that is closest to a
 * desired LSR. The shared instance is obtained via getSingleton().
 *
 * Results are packed into one int32_t: the index of the best supported LSR in the
 * bits from INDEX_SHIFT upwards, and the distance, shifted left by DISTANCE_SHIFT
 * to make room for fraction bits, in the bits covered by DISTANCE_MASK.
 */
class LocaleDistance final : public UMemory {
public:
static const LocaleDistance *getSingleton(UErrorCode &errorCode);
// Converts an integer distance into the shifted form that has fraction bits.
static int32_t shiftDistance(int32_t distance) {
return distance << DISTANCE_SHIFT;
}
// Extracts the shifted distance (including fraction bits) from a packed result.
static int32_t getShiftedDistance(int32_t indexAndDistance) {
return indexAndDistance & DISTANCE_MASK;
}
// Extracts the distance from a packed result as a floating-point number.
static double getDistanceDouble(int32_t indexAndDistance) {
double shiftedDistance = getShiftedDistance(indexAndDistance);
return shiftedDistance / (1 << DISTANCE_SHIFT);
}
// Extracts the distance from a packed result, discarding the fraction bits.
static int32_t getDistanceFloor(int32_t indexAndDistance) {
return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
}
// Extracts the supported-LSR index from a packed result.
static int32_t getIndex(int32_t indexAndDistance) {
// assert indexAndDistance >= 0;
return indexAndDistance >> INDEX_SHIFT;
}
/**
* Finds the supported LSR with the smallest distance from the desired one.
* Equivalent LSR subtags must be normalized into a canonical form.
*
* <p>Returns the index of the lowest-distance supported LSR in the high bits
* (negative if none has a distance below the threshold),
* and its distance (0..ABOVE_THRESHOLD) in the low bits.
*/
int32_t getBestIndexAndDistance(const LSR &desired,
const LSR **supportedLSRs, int32_t supportedLSRsLength,
int32_t shiftedThreshold,
ULocMatchFavorSubtag favorSubtag,
ULocMatchDirection direction) const;
bool isParadigmLSR(const LSR &lsr) const;
int32_t getDefaultScriptDistance() const {
return defaultScriptDistance;
}
int32_t getDefaultDemotionPerDesiredLocale() const {
return defaultDemotionPerDesiredLocale;
}
private:
// The distance is shifted left to gain some fraction bits.
static constexpr int32_t DISTANCE_SHIFT = 3;
static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
// 7 bits for 0..100
static constexpr int32_t DISTANCE_INT_SHIFT = 7;
static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
static constexpr int32_t DISTANCE_MASK = 0x3ff;
// static constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit
// Index -1 in packed form: every bit above DISTANCE_MASK is set.
static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;
LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely);
LocaleDistance(const LocaleDistance &other) = delete;
LocaleDistance &operator=(const LocaleDistance &other) = delete;
static void initLocaleDistance(UErrorCode &errorCode);
// Tests a single desired/supported pair: true if getBestIndexAndDistance()
// reports a non-negative (found) result for the given threshold.
bool isMatch(const LSR &desired, const LSR &supported,
int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
const LSR *pSupp = &supported;
return getBestIndexAndDistance(
desired, &pSupp, 1,
shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0;
}
static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState,
const char *desired, const char *supported);
static int32_t getRegionPartitionsDistance(
BytesTrie &iter, uint64_t startState,
const char *desiredPartitions, const char *supportedPartitions,
int32_t threshold);
static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState);
static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue);
// Returns the partitions string for the region of the given LSR,
// looked up via lsr.regionIndex.
const char *partitionsForRegion(const LSR &lsr) const {
// ill-formed region -> one non-matching string
int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex];
return partitionArrays[pIndex];
}
int32_t getDefaultRegionDistance() const {
return defaultRegionDistance;
}
const LikelySubtags &likelySubtags;
// The trie maps each dlang+slang+dscript+sscript+dregion+sregion
// (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
// There is also a trie value for each subsequence of whole subtags.
// One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"".
BytesTrie trie;
/**
* Maps each region to zero or more single-character partitions.
*/
const uint8_t *regionToPartitionsIndex;
const char **partitionArrays;
/**
* Used to get the paradigm region for a cluster, if there is one.
*/
const LSR *paradigmLSRs;
int32_t paradigmLSRsLength;
// Default distances, copied from the distance data by the constructor.
int32_t defaultLanguageDistance;
int32_t defaultScriptDistance;
int32_t defaultRegionDistance;
int32_t minRegionDistance;
// Computed by the constructor from an en-US vs. en-GB comparison.
int32_t defaultDemotionPerDesiredLocale;
};
U_NAMESPACE_END
#endif // __LOCDISTANCE_H__

File diff suppressed because it is too large Load diff

2742
engine/thirdparty/icu4c/common/locid.cpp vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,437 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: loclikely.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010feb25
* created by: Markus W. Scherer
*
* Code for likely and minimized locale subtags, separated out from other .cpp files
* that then do not depend on resource bundle code and likely-subtags data.
*/
#include <utility>
#include "unicode/bytestream.h"
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/uscript.h"
#include "bytesinkutil.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "loclikelysubtags.h"
#include "ulocimp.h"
namespace {
/**
 * Create a tag string from the supplied parameters and append it to a sink.
 * The lang, script, region, variant and trailing parameters may be nullptr.
 * If they are, their corresponding length parameters must be less than or
 * equal to 0; a component with a length <= 0 is omitted.
 *
 * If an illegal argument is provided (a language, script or region that is too
 * long), the function sets the error U_ILLEGAL_ARGUMENT_ERROR and appends nothing.
 *
 * @param lang The language tag to use.
 * @param langLength The length of the language tag.
 * @param script The script tag to use.
 * @param scriptLength The length of the script tag.
 * @param region The region tag to use.
 * @param regionLength The length of the region tag.
 * @param variant The variant tag to use.
 * @param variantLength The length of the variant tag.
 * @param trailing Any trailing data to append to the new tag.
 * @param trailingLength The length of the trailing data.
 * @param sink The output sink receiving the tag string.
 * @param err A reference to a UErrorCode for error reporting.
 **/
void U_CALLCONV
createTagStringWithAlternates(
    const char* lang,
    int32_t langLength,
    const char* script,
    int32_t scriptLength,
    const char* region,
    int32_t regionLength,
    const char* variant,
    int32_t variantLength,
    const char* trailing,
    int32_t trailingLength,
    icu::ByteSink& sink,
    UErrorCode& err) {
    if (U_FAILURE(err)) {
        return;
    }

    if (langLength >= ULOC_LANG_CAPACITY ||
            scriptLength >= ULOC_SCRIPT_CAPACITY ||
            regionLength >= ULOC_COUNTRY_CAPACITY) {
        err = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (langLength > 0) {
        sink.Append(lang, langLength);
    }

    if (scriptLength > 0) {
        sink.Append("_", 1);
        sink.Append(script, scriptLength);
    }

    if (regionLength > 0) {
        sink.Append("_", 1);
        sink.Append(region, regionLength);
    }

    if (variantLength > 0) {
        /*
         * Without a region an extra separator is required, so that the variant
         * is not mistaken for a region. An absent region may be described by
         * any length <= 0, not only by 0.
         */
        if (regionLength <= 0) {
            sink.Append("_", 1);
        }
        sink.Append("_", 1);
        sink.Append(variant, variantLength);
    }

    if (trailingLength > 0) {
        /*
         * Append the trailing data unchanged.
         */
        sink.Append(trailing, trailingLength);
    }
}
bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
int32_t count = 0;
for (int32_t i = 0; i < variantLength; i++) {
if (_isIDSeparator(variant[i])) {
count = 0;
} else if (count == 8) {
return false;
} else {
count++;
}
}
return true;
}
void
_uloc_addLikelySubtags(const char* localeID,
icu::ByteSink& sink,
UErrorCode& err) {
if (U_FAILURE(err)) {
return;
}
if (localeID == nullptr) {
err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
icu::CharString lang;
icu::CharString script;
icu::CharString region;
icu::CharString variant;
const char* trailing = nullptr;
ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
if (U_FAILURE(err)) {
return;
}
if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (lang.length() == 4) {
if (script.isEmpty()) {
script = std::move(lang);
lang.clear();
} else {
err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
} else if (lang.length() > 8) {
err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
int32_t trailingLength = (int32_t)uprv_strlen(trailing);
const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
if (U_FAILURE(err)) {
return;
}
// We need to keep l on the stack because lsr may point into internal
// memory of l.
icu::Locale l = icu::Locale::createFromName(localeID);
if (l.isBogus()) {
err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
if (U_FAILURE(err)) {
return;
}
const char* language = lsr.language;
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
createTagStringWithAlternates(
language,
(int32_t)uprv_strlen(language),
lsr.script,
(int32_t)uprv_strlen(lsr.script),
lsr.region,
(int32_t)uprv_strlen(lsr.region),
variant.data(),
variant.length(),
trailing,
trailingLength,
sink,
err);
}
void
_uloc_minimizeSubtags(const char* localeID,
icu::ByteSink& sink,
bool favorScript,
UErrorCode& err) {
if (U_FAILURE(err)) {
return;
}
if (localeID == nullptr) {
err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
icu::CharString lang;
icu::CharString script;
icu::CharString region;
icu::CharString variant;
const char* trailing = nullptr;
ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
if (U_FAILURE(err)) {
return;
}
if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
int32_t trailingLength = (int32_t)uprv_strlen(trailing);
const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
if (U_FAILURE(err)) {
return;
}
icu::LSR lsr = likelySubtags->minimizeSubtags(
lang.toStringPiece(),
script.toStringPiece(),
region.toStringPiece(),
favorScript,
err);
if (U_FAILURE(err)) {
return;
}
const char* language = lsr.language;
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
createTagStringWithAlternates(
language,
(int32_t)uprv_strlen(language),
lsr.script,
(int32_t)uprv_strlen(lsr.script),
lsr.region,
(int32_t)uprv_strlen(lsr.region),
variant.data(),
variant.length(),
trailing,
trailingLength,
sink,
err);
}
} // namespace
/**
 * C API: adds the likely subtags to a locale ID and writes the result as a
 * NUL-terminated string into a caller-supplied buffer.
 *
 * @param localeID                  the locale ID to maximize
 * @param maximizedLocaleID         output buffer
 * @param maximizedLocaleIDCapacity capacity of the output buffer
 * @param status                    in/out error code; if it is nullptr, nothing is
 *                                  done and 0 is returned
 * @return the value returned by ByteSinkUtil::viaByteSinkToTerminatedChars(),
 *         or 0 if status is nullptr
 */
U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char* localeID,
                      char* maximizedLocaleID,
                      int32_t maximizedLocaleIDCapacity,
                      UErrorCode* status) {
    // *status is bound to a reference below, so a null pointer must be
    // rejected before it is dereferenced.
    if (status == nullptr) {
        return 0;
    }
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        maximizedLocaleID, maximizedLocaleIDCapacity,
        // The parameter is deliberately not called "status", to avoid
        // shadowing the pointer of the enclosing function.
        [&](icu::ByteSink& sink, UErrorCode& errorCode) {
            ulocimp_addLikelySubtags(localeID, sink, errorCode);
        },
        *status);
}
/**
 * Adds the likely subtags to a locale ID and returns the result as a CharString.
 *
 * @param localeID the locale ID to maximize
 * @param status   in/out error code
 * @return the string collected by ByteSinkUtil::viaByteSinkToCharString()
 */
U_EXPORT icu::CharString
ulocimp_addLikelySubtags(const char* localeID,
                         UErrorCode& status) {
    // Writer that forwards to the sink-based overload.
    auto writeMaximized = [&](icu::ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_addLikelySubtags(localeID, sink, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(writeMaximized, status);
}
/**
 * Canonicalizes a locale ID, adds the likely subtags and appends the result
 * to a sink.
 *
 * @param localeID the locale ID to maximize
 * @param sink     receives the maximized locale ID
 * @param status   in/out error code; nothing is done if it already indicates
 *                 a failure
 */
U_EXPORT void
ulocimp_addLikelySubtags(const char* localeID,
                         icu::ByteSink& sink,
                         UErrorCode& status) {
    if (U_SUCCESS(status)) {
        icu::CharString canonical = ulocimp_canonicalize(localeID, status);
        _uloc_addLikelySubtags(canonical.data(), sink, status);
    }
}
/**
 * C API: minimizes the subtags of a locale ID and writes the result as a
 * NUL-terminated string into a caller-supplied buffer.
 *
 * @param localeID                  the locale ID to minimize
 * @param minimizedLocaleID         output buffer
 * @param minimizedLocaleIDCapacity capacity of the output buffer
 * @param status                    in/out error code; if it is nullptr, nothing is
 *                                  done and 0 is returned
 * @return the value returned by ByteSinkUtil::viaByteSinkToTerminatedChars(),
 *         or 0 if status is nullptr
 */
U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char* localeID,
                     char* minimizedLocaleID,
                     int32_t minimizedLocaleIDCapacity,
                     UErrorCode* status) {
    // *status is bound to a reference below, so a null pointer must be
    // rejected before it is dereferenced.
    if (status == nullptr) {
        return 0;
    }
    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
        minimizedLocaleID, minimizedLocaleIDCapacity,
        // The parameter is deliberately not called "status", to avoid
        // shadowing the pointer of the enclosing function.
        [&](icu::ByteSink& sink, UErrorCode& errorCode) {
            ulocimp_minimizeSubtags(localeID, sink, false, errorCode);
        },
        *status);
}
/**
 * Minimizes the subtags of a locale ID and returns the result as a CharString.
 *
 * @param localeID    the locale ID to minimize
 * @param favorScript passed through to the sink-based overload
 * @param status      in/out error code
 * @return the string collected by ByteSinkUtil::viaByteSinkToCharString()
 */
U_EXPORT icu::CharString
ulocimp_minimizeSubtags(const char* localeID,
                        bool favorScript,
                        UErrorCode& status) {
    // Writer that forwards to the sink-based overload.
    auto writeMinimized = [&](icu::ByteSink& sink, UErrorCode& errorCode) {
        ulocimp_minimizeSubtags(localeID, sink, favorScript, errorCode);
    };
    return icu::ByteSinkUtil::viaByteSinkToCharString(writeMinimized, status);
}
/**
 * Canonicalizes a locale ID, minimizes its subtags and appends the result
 * to a sink.
 *
 * @param localeID    the locale ID to minimize
 * @param sink        receives the minimized locale ID
 * @param favorScript passed through to _uloc_minimizeSubtags()
 * @param status      in/out error code; nothing is done if it already indicates
 *                    a failure
 */
U_EXPORT void
ulocimp_minimizeSubtags(const char* localeID,
                        icu::ByteSink& sink,
                        bool favorScript,
                        UErrorCode& status) {
    if (U_SUCCESS(status)) {
        icu::CharString canonical = ulocimp_canonicalize(localeID, status);
        _uloc_minimizeSubtags(canonical.data(), sink, favorScript, status);
    }
}
// Lookup string of common languages, each followed by a direction marker:
// '-' after a language written left-to-right, '+' after one written right-to-left.
static const char LANG_DIR_STRING[] =
        "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";

/**
 * Tells whether the script of a locale is written right-to-left.
 * Implemented here because this calls ulocimp_addLikelySubtags().
 *
 * If the locale ID carries no script, a fast path for some common languages is
 * tried first; otherwise the likely script is determined by adding the likely
 * subtags.
 *
 * @param locale the locale ID to inspect
 * @return the result of uscript_isRightToLeft() for the (likely) script, or false
 *         if no script can be determined
 */
U_CAPI UBool U_EXPORT2
uloc_isRightToLeft(const char *locale) {
    UErrorCode errorCode = U_ZERO_ERROR;
    icu::CharString lang;
    icu::CharString script;
    ulocimp_getSubtags(locale, &lang, &script, nullptr, nullptr, nullptr, errorCode);

    const bool scriptUnknown = U_FAILURE(errorCode) || script.isEmpty();
    if (scriptUnknown) {
        // Fast path: for some common languages the direction of the likely
        // script is known without any data lookup.
        if (!lang.isEmpty()) {
            const char* hit = uprv_strstr(LANG_DIR_STRING, lang.data());
            if (hit != nullptr) {
                const char marker = hit[lang.length()];
                if (marker == '-') {
                    return false;
                }
                if (marker == '+') {
                    return true;
                }
                // anything else: partial match of a longer code
            }
        }

        // Slow path: determine the likely script.
        errorCode = U_ZERO_ERROR;
        icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
        if (U_FAILURE(errorCode)) {
            return false;
        }
        ulocimp_getSubtags(likely.data(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
        if (U_FAILURE(errorCode) || script.isEmpty()) {
            return false;
        }
    }

    const UScriptCode scriptCode =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, script.data()));
    return uscript_isRightToLeft(scriptCode);
}
U_NAMESPACE_BEGIN
// Member-function form of uloc_isRightToLeft(), applied to this locale's base name.
UBool
Locale::isRightToLeft() const {
    const char *baseName = getBaseName();
    return uloc_isRightToLeft(baseName);
}
U_NAMESPACE_END
namespace {

// Reads the value of keyword `key` from localeID and derives a region code
// from it. Returns an empty string when the lookup fails or the value is not
// 3..7 characters long.
icu::CharString
GetRegionFromKey(const char* localeID, const char* key, UErrorCode& status) {
    icu::CharString region;
    icu::CharString keywordValue = ulocimp_getKeywordValue(localeID, key, status);
    const int32_t valueLength = keywordValue.length();
    if (U_FAILURE(status) || valueLength < 3 || valueLength > 7) {
        return region;
    }
    // Only the leading region part is kept; the subdivision code
    // (which will generally be "zzzz" anyway) is chopped off.
    const char* const chars = keywordValue.data();
    if (!uprv_isASCIILetter(chars[0])) {
        // assume three-digit region code
        region.append(chars, 3, status);
        return region;
    }
    region.append(uprv_toupper(chars[0]), status);
    region.append(uprv_toupper(chars[1]), status);
    return region;
}

}  // namespace
// Determines the region to use for supplemental data lookups. Sources are
// tried in this order, stopping at the first non-empty result or failure:
//   1. the "rg" keyword value,
//   2. the region subtag of localeID,
//   3. (only if inferRegion) the "sd" keyword value,
//   4. (only if inferRegion) the region of the maximized locale.
U_EXPORT icu::CharString
ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
                                     UErrorCode& status) {
    if (U_FAILURE(status)) {
        return {};
    }
    icu::CharString region = GetRegionFromKey(localeID, "rg", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }
    // No valid rg keyword value, try for unicode_region_subtag
    region = ulocimp_getRegion(localeID, status);
    if (U_FAILURE(status) || !region.isEmpty() || !inferRegion) {
        return region;
    }
    // Second check for sd keyword value
    region = GetRegionFromKey(localeID, "sd", status);
    if (U_FAILURE(status) || !region.isEmpty()) {
        return region;
    }
    // no unicode_region_subtag but inferRegion true, try likely subtags.
    // A failure while maximizing is tracked separately and not reported
    // through status.
    UErrorCode maximizeStatus = U_ZERO_ERROR;
    icu::CharString maximized = ulocimp_addLikelySubtags(localeID, maximizeStatus);
    if (U_SUCCESS(maximizeStatus)) {
        region = ulocimp_getRegion(maximized.data(), status);
    }
    return region;
}

View file

@ -0,0 +1,976 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// loclikelysubtags.cpp
// created: 2019may08 Markus W. Scherer
#include <utility>
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/localpointer.h"
#include "unicode/locid.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "unicode/uscript.h"
#include "charstr.h"
#include "cstring.h"
#include "loclikelysubtags.h"
#include "lsr.h"
#include "uassert.h"
#include "ucln_cmn.h"
#include "uhash.h"
#include "uinvchar.h"
#include "umutex.h"
#include "uniquecharstr.h"
#include "uresdata.h"
#include "uresimp.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
namespace {
// Prefix characters passed to the LSR constructor for pseudo-locales
// (see LikelySubtags::makeMaximizedLsr); the trailing comments name the
// region code and the variant that select each prefix.
constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT
constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI
constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK
} // namespace
// Move constructor. All fields are copied; the two arrays released by the
// destructor (partitions, paradigms) are additionally cleared in the source
// so that they are not freed twice.
LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) :
        distanceTrieBytes(data.distanceTrieBytes),
        regionToPartitions(data.regionToPartitions),
        partitions(std::exchange(data.partitions, nullptr)),
        paradigms(std::exchange(data.paradigms, nullptr)),
        paradigmsLength(data.paradigmsLength),
        distances(data.distances) {
}
// Releases the two arrays this object owns. The deallocation calls mirror the
// allocations in LikelySubtagsData::load(): partitions comes from
// uprv_malloc(), paradigms from new[].
LocaleDistanceData::~LocaleDistanceData() {
uprv_free(partitions);
delete[] paradigms;
}
// Loader and temporary holder for everything read from the "langInfo"
// resource bundle: the likely-subtags trie and LSR table, the language and
// region alias maps, and (when present) the locale distance/matcher data.
// The LikelySubtags constructor takes the loaded pieces over from here.
struct LikelySubtagsData {
// Bundle handle; closed in the destructor unless it was handed off (set to nullptr).
UResourceBundle *langInfoBundle = nullptr;
// De-duplicated storage for all subtag/alias strings referenced below.
UniqueCharStrings strings;
CharStringMap languageAliases;
CharStringMap regionAliases;
// Bytes of the "likely"/"trie" binary resource.
const uint8_t *trieBytes = nullptr;
// Array of lsrsLength LSR entries, allocated with new[] in load().
LSR *lsrs = nullptr;
int32_t lsrsLength = 0;
LocaleDistanceData distanceData;
LikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {}
~LikelySubtagsData() {
ures_close(langInfoBundle);
delete[] lsrs;
}
// Reads the "likely" table (required) and the "match" table (optional) from
// "langInfo" and fills the members above. On any problem errorCode is set
// and the function returns early, leaving the object partially filled.
void load(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return; }
langInfoBundle = ures_openDirect(nullptr, "langInfo", &errorCode);
if (U_FAILURE(errorCode)) { return; }
StackUResourceBundle stackTempBundle;
ResourceDataValue value;
ures_getValueWithFallback(langInfoBundle, "likely", stackTempBundle.getAlias(),
value, errorCode);
ResourceTable likelyTable = value.getTable(errorCode);
if (U_FAILURE(errorCode)) { return; }
// Read all strings in the resource bundle and convert them to invariant char *.
LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
// The "m49" array is required; toRegion() uses it to resolve region indexes.
ResourceArray m49Array;
if (likelyTable.findValue("m49", value)) {
m49Array = value.getArray(errorCode);
} else {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
if (!readStrings(likelyTable, "languageAliases", value,
languageIndexes, languagesLength, errorCode) ||
!readStrings(likelyTable, "regionAliases", value,
regionIndexes, regionsLength, errorCode) ||
!readLSREncodedStrings(likelyTable, "lsrnum", value, m49Array,
lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
return;
}
// Aliases are consumed below as (alias, replacement) pairs and LSR
// subtags as (language, script, region) triples.
if ((languagesLength & 1) != 0 ||
(regionsLength & 1) != 0 ||
(lsrSubtagsLength % 3) != 0) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
if (lsrSubtagsLength == 0) {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
if (!likelyTable.findValue("trie", value)) {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
int32_t length;
trieBytes = value.getBinary(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
// Also read distance/matcher data if available,
// to open & keep only one resource bundle pointer
// and to use one single UniqueCharStrings.
// A separate error code is used so that a missing "match" table can be tolerated.
UErrorCode matchErrorCode = U_ZERO_ERROR;
ures_getValueWithFallback(langInfoBundle, "match", stackTempBundle.getAlias(),
value, matchErrorCode);
LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes;
int32_t partitionsLength = 0, paradigmSubtagsLength = 0;
if (U_SUCCESS(matchErrorCode)) {
ResourceTable matchTable = value.getTable(errorCode);
if (U_FAILURE(errorCode)) { return; }
if (matchTable.findValue("trie", value)) {
distanceData.distanceTrieBytes = value.getBinary(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
}
if (matchTable.findValue("regionToPartitions", value)) {
distanceData.regionToPartitions = value.getBinary(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
if (length < LSR::REGION_INDEX_LIMIT) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
}
if (!readStrings(matchTable, "partitions", value,
partitionIndexes, partitionsLength, errorCode) ||
!readLSREncodedStrings(matchTable, "paradigmnum", value, m49Array,
paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
return;
}
if ((paradigmSubtagsLength % 3) != 0) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
if (matchTable.findValue("distances", value)) {
distanceData.distances = value.getIntVector(length, errorCode);
if (U_FAILURE(errorCode)) { return; }
if (length < 4) { // LocaleDistance IX_LIMIT
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
}
} else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) {
// ok for likely subtags
} else { // error other than missing resource
errorCode = matchErrorCode;
return;
}
// Fetch & store invariant-character versions of strings
// only after we have collected and de-duplicated all of them.
strings.freeze();
languageAliases = CharStringMap(languagesLength / 2, errorCode);
for (int32_t i = 0; i < languagesLength; i += 2) {
languageAliases.put(strings.get(languageIndexes[i]),
strings.get(languageIndexes[i + 1]), errorCode);
}
regionAliases = CharStringMap(regionsLength / 2, errorCode);
for (int32_t i = 0; i < regionsLength; i += 2) {
regionAliases.put(strings.get(regionIndexes[i]),
strings.get(regionIndexes[i + 1]), errorCode);
}
if (U_FAILURE(errorCode)) { return; }
// Build the LSR table: one entry per (language, script, region) triple.
lsrsLength = lsrSubtagsLength / 3;
lsrs = new LSR[lsrsLength];
if (lsrs == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) {
lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]),
strings.get(lsrSubtagIndexes[i + 1]),
strings.get(lsrSubtagIndexes[i + 2]),
LSR::IMPLICIT_LSR);
}
// partitions is allocated with uprv_malloc(); ~LocaleDistanceData() releases it with uprv_free().
if (partitionsLength > 0) {
distanceData.partitions = static_cast<const char **>(
uprv_malloc(partitionsLength * sizeof(const char *)));
if (distanceData.partitions == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (int32_t i = 0; i < partitionsLength; ++i) {
distanceData.partitions[i] = strings.get(partitionIndexes[i]);
}
}
// paradigms is allocated with new[]; ~LocaleDistanceData() releases it with delete[].
if (paradigmSubtagsLength > 0) {
distanceData.paradigmsLength = paradigmSubtagsLength / 3;
LSR *paradigms = new LSR[distanceData.paradigmsLength];
if (paradigms == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) {
paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]),
strings.get(paradigmSubtagIndexes[i + 1]),
strings.get(paradigmSubtagIndexes[i + 2]),
LSR::DONT_CARE_FLAGS);
}
distanceData.paradigms = paradigms;
}
}
private:
// Reads the string array stored under `key` in `table`, adds every string to
// `strings`, and stores the resulting string indexes in `indexes`
// (`length` = number of strings). A missing key is not an error: the
// function then returns true and leaves `indexes` and `length` untouched.
// Returns false only when errorCode is or becomes a failure.
bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value,
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return false; }
if (table.findValue(key, value)) {
ResourceArray stringArray = value.getArray(errorCode);
if (U_FAILURE(errorCode)) { return false; }
length = stringArray.getSize();
if (length == 0) { return true; }
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length);
if (rawIndexes == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
for (int i = 0; i < length; ++i) {
if (stringArray.getValue(i, value)) { // returns true because i < length
int32_t strLength = 0;
rawIndexes[i] = strings.add(value.getString(strLength, errorCode), errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
}
}
return true;
}
// Decodes the language subtag from a packed LSR integer.
// 0 yields "" and 1 yields "skip"; otherwise the low 24 bits, taken modulo
// 27^3, hold two or three base-27 digits with digit 1 standing for 'a'.
UnicodeString toLanguage(int encoded) {
if (encoded == 0) {
return UNICODE_STRING_SIMPLE("");
}
if (encoded == 1) {
return UNICODE_STRING_SIMPLE("skip");
}
encoded &= 0x00ffffff;
encoded %= 27*27*27;
char lang[3];
lang[0] = 'a' + ((encoded % 27) - 1);
lang[1] = 'a' + (((encoded / 27 ) % 27) - 1);
// A zero third digit means the code has only two letters.
if (encoded / (27 * 27) == 0) {
return UnicodeString(lang, 2, US_INV);
}
lang[2] = 'a' + ((encoded / (27 * 27)) - 1);
return UnicodeString(lang, 3, US_INV);
}
// Decodes the script subtag from a packed LSR integer.
// 0 yields "" and 1 yields "script"; otherwise the top 8 bits are cast to a
// UScriptCode whose short name is returned ("" if there is no such name).
UnicodeString toScript(int encoded) {
if (encoded == 0) {
return UNICODE_STRING_SIMPLE("");
}
if (encoded == 1) {
return UNICODE_STRING_SIMPLE("script");
}
encoded = (encoded >> 24) & 0x000000ff;
const char* script = uscript_getShortName(static_cast<UScriptCode>(encoded));
if (script == nullptr) {
return UNICODE_STRING_SIMPLE("");
}
U_ASSERT(uprv_strlen(script) == 4);
return UnicodeString(script, 4, US_INV);
}
// Returns element `index` of the "m49" array as a string; sets
// U_MISSING_RESOURCE_ERROR when the array has no such element.
// Overwrites `value` as a side effect.
UnicodeString m49IndexToCode(const ResourceArray &m49Array, ResourceValue &value, int index, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) {
return UNICODE_STRING_SIMPLE("");
}
if (m49Array.getValue(index, value)) {
return value.getUnicodeString(errorCode);
}
// "m49" does not include the index.
errorCode = U_MISSING_RESOURCE_ERROR;
return UNICODE_STRING_SIMPLE("");
}
// Decodes the region subtag from a packed LSR integer.
// 0 and 1 yield "". Otherwise the region field is (low 24 bits / 27^3) % 27^2:
// values below 27 are indexes into the "m49" array, larger values are two
// base-27 digits with digit 1 standing for 'A'.
UnicodeString toRegion(const ResourceArray& m49Array, ResourceValue &value, int encoded, UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || encoded == 0 || encoded == 1) {
return UNICODE_STRING_SIMPLE("");
}
encoded &= 0x00ffffff;
encoded /= 27 * 27 * 27;
encoded %= 27 * 27;
if (encoded < 27) {
// Selected M49 code index, find the code from "m49" resource.
return m49IndexToCode(m49Array, value, encoded, errorCode);
}
char region[2];
region[0] = 'A' + ((encoded % 27) - 1);
region[1] = 'A' + (((encoded / 27) % 27) - 1);
return UnicodeString(region, 2, US_INV);
}
// Reads the int vector stored under `key`, decodes every element into its
// language, script and region strings, and stores three string indexes per
// element in `indexes`. On return `length` is the number of indexes, i.e.
// three times the number of vector elements. A missing key is not an error.
// Returns false only when errorCode is or becomes a failure.
bool readLSREncodedStrings(const ResourceTable &table, const char* key, ResourceValue &value, const ResourceArray& m49Array,
LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return false; }
if (table.findValue(key, value)) {
const int32_t* vectors = value.getIntVector(length, errorCode);
if (U_FAILURE(errorCode)) { return false; }
if (length == 0) { return true; }
int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length * 3);
if (rawIndexes == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return false;
}
for (int i = 0; i < length; ++i) {
rawIndexes[i*3] = strings.addByValue(toLanguage(vectors[i]), errorCode);
rawIndexes[i*3+1] = strings.addByValue(toScript(vectors[i]), errorCode);
rawIndexes[i*3+2] = strings.addByValue(
toRegion(m49Array, value, vectors[i], errorCode), errorCode);
if (U_FAILURE(errorCode)) { return false; }
}
length *= 3;
}
return true;
}
};
namespace {
// Lazily created singletons; both are set by LikelySubtags::initLikelySubtags(),
// which is run through gInitOnce.
LikelySubtags *gLikelySubtags = nullptr;
UVector *gMacroregions = nullptr;
UInitOnce gInitOnce {};
// Cleanup callback (registered in initLikelySubtags): deletes both singletons
// and resets gInitOnce so that initialization can run again.
UBool U_CALLCONV cleanup() {
delete gLikelySubtags;
gLikelySubtags = nullptr;
delete gMacroregions;
gMacroregions = nullptr;
gInitOnce.reset();
return true;
}
// Hard-coded list of macroregion codes. An entry of the form "NNN~M" is a
// range: processMacroregionRange() expands it by incrementing the character
// before '~' until it passes the character after '~'
// (e.g. "001~3" becomes "001", "002", "003").
constexpr const char16_t* MACROREGION_HARDCODE[] = {
u"001~3",
u"005",
u"009",
u"011",
u"013~5",
u"017~9",
u"021",
u"029",
u"030",
u"034~5",
u"039",
u"053~4",
u"057",
u"061",
u"142~3",
u"145",
u"150~1",
u"154~5",
u"202",
u"419",
u"EU",
u"EZ",
u"QO",
u"UN",
};
// Separator between the range start and the final last character in a range entry.
constexpr char16_t RANGE_MARKER = 0x7E; /* '~' */
void processMacroregionRange(const UnicodeString& regionName, UVector* newMacroRegions, UErrorCode& status) {
if (U_FAILURE(status)) { return; }
int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER);
char16_t buf[6];
regionName.extract(buf,6,status);
if ( rangeMarkerLocation > 0 ) {
char16_t endRange = regionName.charAt(rangeMarkerLocation+1);
buf[rangeMarkerLocation] = 0;
while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) {
LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status);
newMacroRegions->adoptElement(newRegion.orphan(),status);
buf[rangeMarkerLocation-1]++;
}
} else {
LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status);
newMacroRegions->adoptElement(newRegion.orphan(),status);
}
}
#if U_DEBUG
// Builds the macroregion list from resource data
// (supplementalData/idValidity/region/macroregion), expanding range entries.
// Returns a new vector owned by the caller, or nullptr on failure.
UVector* loadMacroregions(UErrorCode &status) {
    if (U_FAILURE(status)) { return nullptr; }
    LocalPointer<UVector> regions(
        new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);
    // Walk down the bundle hierarchy; a failure at any level is caught by
    // the single status check after the last step.
    LocalUResourceBundlePointer supplementalData(
        ures_openDirect(nullptr, "supplementalData", &status));
    LocalUResourceBundlePointer idValidity(
        ures_getByKey(supplementalData.getAlias(), "idValidity", nullptr, &status));
    LocalUResourceBundlePointer regionList(
        ures_getByKey(idValidity.getAlias(), "region", nullptr, &status));
    LocalUResourceBundlePointer regionMacro(
        ures_getByKey(regionList.getAlias(), "macroregion", nullptr, &status));
    if (U_FAILURE(status)) {
        return nullptr;
    }
    UResourceBundle *macroBundle = regionMacro.getAlias();
    while (ures_hasNext(macroBundle)) {
        UnicodeString entry = ures_getNextUnicodeString(macroBundle, nullptr, &status);
        processMacroregionRange(entry, regions.getAlias(), status);
        if (U_FAILURE(status)) {
            return nullptr;
        }
    }
    return regions.orphan();
}
#endif // U_DEBUG
// Builds the macroregion list from the MACROREGION_HARDCODE table, expanding
// range entries. Returns a new vector owned by the caller, or nullptr on failure.
UVector* getStaticMacroregions(UErrorCode &status) {
    if (U_FAILURE(status)) { return nullptr; }
    LocalPointer<UVector> regions(
        new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    for (const char16_t *hardcoded : MACROREGION_HARDCODE) {
        UnicodeString entry(hardcoded);
        processMacroregionRange(entry, regions.getAlias(), status);
        if (U_FAILURE(status)) {
            return nullptr;
        }
    }
    return regions.orphan();
}
} // namespace
// One-time initializer for gLikelySubtags and gMacroregions.
//
// Loads the "langInfo" data, creates the LikelySubtags singleton and the
// macroregion list, and registers cleanup(). On failure both globals are left
// as nullptr and errorCode holds the failure: the original error if there was
// one, otherwise U_MEMORY_ALLOCATION_ERROR.
void U_CALLCONV LikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
    // This function is invoked only via umtx_initOnce().
    U_ASSERT(gLikelySubtags == nullptr);
    LikelySubtagsData data(errorCode);
    data.load(errorCode);
    if (U_FAILURE(errorCode)) { return; }
    gLikelySubtags = new LikelySubtags(data);
    gMacroregions = getStaticMacroregions(errorCode);
#if U_DEBUG
    // Debug builds verify that the hard-coded list matches the resource data.
    auto macroregionsFromData = loadMacroregions(errorCode);
    // Either list may be nullptr after a failure; only compare real lists.
    if (gMacroregions != nullptr && macroregionsFromData != nullptr) {
        U_ASSERT((*gMacroregions) == (*macroregionsFromData));
    }
    delete macroregionsFromData;
#endif
    if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) {
        // Reset the globals after deleting them so that no caller can pick up
        // a dangling pointer.
        delete gLikelySubtags;
        gLikelySubtags = nullptr;
        delete gMacroregions;
        gMacroregions = nullptr;
        // Keep a more specific error if one was already reported.
        if (U_SUCCESS(errorCode)) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }
    ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
}
// Returns the shared LikelySubtags instance, creating it on first use.
// Returns nullptr, with errorCode set, when errorCode is already a failure
// on entry or when initialization failed.
const LikelySubtags *LikelySubtags::getSingleton(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return nullptr; }
    umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode);
    // Never hand out the global after a failed initialization: the
    // initializer deletes the object on its failure path.
    if (U_FAILURE(errorCode)) { return nullptr; }
    return gLikelySubtags;
}
// Takes over the loaded data: the bundle handle and the LSR array are
// transferred (and cleared in `data`), the string storage is orphaned from
// data.strings, and the alias maps and distance data are moved.
// Afterwards trie states that are needed on every lookup are cached.
LikelySubtags::LikelySubtags(LikelySubtagsData &data) :
        langInfoBundle(data.langInfoBundle),
        strings(data.strings.orphanCharStrings()),
        languageAliases(std::move(data.languageAliases)),
        regionAliases(std::move(data.regionAliases)),
        trie(data.trieBytes),
        lsrs(data.lsrs),
#if U_DEBUG
        lsrsLength(data.lsrsLength),
#endif  // U_DEBUG
        distanceData(std::move(data.distanceData)) {
    data.langInfoBundle = nullptr;
    data.lsrs = nullptr;

    // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**").
    UStringTrieResult result = trie.next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
    trieUndState = trie.getState64();
    result = trie.next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
    trieUndZzzzState = trie.getState64();
    result = trie.next(u'*');
    U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
    defaultLsrIndex = trie.getValue();
    trie.reset();

    // Cache the trie state after each possible first language letter.
    // Every entry is written: 0 marks "no cached state", which is what
    // maximize() and getLikelyIndex() test for before using an entry.
    for (char16_t c = u'a'; c <= u'z'; ++c) {
        result = trie.next(c);
        trieFirstLetterStates[c - u'a'] =
            (result == USTRINGTRIE_NO_VALUE) ? trie.getState64() : 0;
        trie.reset();
    }
}
// Releases what the constructor took over from LikelySubtagsData:
// the resource bundle handle, the string storage and the LSR array.
LikelySubtags::~LikelySubtags() {
ures_close(langInfoBundle);
delete strings;
delete[] lsrs;
}
// Maximizes the language/script/region of `locale`.
// A bogus locale sets U_ILLEGAL_ARGUMENT_ERROR. A name starting with "@x="
// is returned as is in the language field. When the maximized result has all
// three subtags empty, the locale's own subtags are returned instead.
LSR LikelySubtags::makeMaximizedLsrFrom(const Locale &locale,
                                         bool returnInputIfUnmatch,
                                         UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    if (locale.isBogus()) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return {};
    }
    const char *name = locale.getName();
    // name.startsWith("@x=")
    const bool isPrivateUse = uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=';
    if (isPrivateUse) {
        // Private use language tag x-subtag-subtag... which CLDR changes to
        // und-x-subtag-subtag...
        return LSR(name, "", "", LSR::EXPLICIT_LSR);
    }
    LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
                               locale.getVariant(), returnInputIfUnmatch, errorCode);
    const bool noMatch =
        max.language[0] == 0 && max.script[0] == 0 && max.region[0] == 0;
    if (!noMatch) {
        return max;
    }
    // No match. The ICU API mandates: "If the provided ULocale instance is
    // already in the maximal form, or there is no data available for
    // maximization, it will be returned."
    return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(),
               LSR::EXPLICIT_LSR, errorCode);
}
namespace {

// Returns the replacement registered for `alias` in `aliases`,
// or `alias` itself when the map has no entry for it.
const char *getCanonical(const CharStringMap &aliases, const char *alias) {
    if (const char *mapped = aliases.get(alias)) {
        return mapped;
    }
    return alias;
}

}  // namespace
// Maximizes language/script/region after special-casing pseudolocales and
// replacing language and region aliases with their canonical codes.
LSR LikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
                                    const char *variant,
                                    bool returnInputIfUnmatch,
                                    UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
    // They should match only themselves,
    // not other locales with what looks like the same language and script subtags.

    // Two-letter region XA / XB / XC.
    if (region[0] == 'X' && region[1] != 0 && region[2] == 0) {
        char prefix = 0;
        switch (region[1]) {
        case 'A': prefix = PSEUDO_ACCENTS_PREFIX; break;
        case 'B': prefix = PSEUDO_BIDI_PREFIX; break;
        case 'C': prefix = PSEUDO_CRACKED_PREFIX; break;
        default: break;  // normal locale
        }
        if (prefix != 0) {
            if (returnInputIfUnmatch) {
                return LSR(language, script, region, LSR::EXPLICIT_LSR);
            }
            return LSR(prefix, language, script, region,
                       LSR::EXPLICIT_LSR, errorCode);
        }
    }

    // Variant PSACCENT / PSBIDI / PSCRACK; a missing region is replaced by
    // the matching X? code and then does not count as explicit.
    if (variant[0] == 'P' && variant[1] == 'S') {
        const bool noRegion = (*region == 0);
        int32_t lsrFlags = noRegion ?
            LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
        char prefix = 0;
        const char *pseudoRegion = nullptr;
        if (uprv_strcmp(variant, "PSACCENT") == 0) {
            prefix = PSEUDO_ACCENTS_PREFIX;
            pseudoRegion = "XA";
        } else if (uprv_strcmp(variant, "PSBIDI") == 0) {
            prefix = PSEUDO_BIDI_PREFIX;
            pseudoRegion = "XB";
        } else if (uprv_strcmp(variant, "PSCRACK") == 0) {
            prefix = PSEUDO_CRACKED_PREFIX;
            pseudoRegion = "XC";
        }
        if (prefix != 0) {
            return LSR(prefix, language, script,
                       noRegion ? pseudoRegion : region, lsrFlags, errorCode);
        }
        // else normal locale
    }

    const char *canonicalLanguage = getCanonical(languageAliases, language);
    // (We have no script mappings.)
    const char *canonicalRegion = getCanonical(regionAliases, region);
    return maximize(canonicalLanguage, script, canonicalRegion, returnInputIfUnmatch, errorCode);
}
// NUL-terminated-string overload: wraps each subtag in a StringPiece of its
// strlen() length and forwards to the StringPiece overload.
LSR LikelySubtags::maximize(const char *language, const char *script, const char *region,
                             bool returnInputIfUnmatch,
                             UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    const StringPiece languagePiece(language, static_cast<int32_t>(uprv_strlen(language)));
    const StringPiece scriptPiece(script, static_cast<int32_t>(uprv_strlen(script)));
    const StringPiece regionPiece(region, static_cast<int32_t>(uprv_strlen(region)));
    return maximize(languagePiece, scriptPiece, regionPiece, returnInputIfUnmatch, errorCode);
}
// Returns true if `region` is contained in the gMacroregions list.
// Returns false when errorCode is, or becomes, a failure.
bool LikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const {
    if (U_FAILURE(errorCode)) { return false; }
    // In Java, we use the Region class. In C++, since Region is under i18n,
    // we read the same data used by Region into gMacroregions to avoid a
    // dependency from common on i18n/region.cpp.
    umtx_initOnce(gInitOnce, &LikelySubtags::initLikelySubtags, errorCode);
    if (U_FAILURE(errorCode)) { return false; }
    UnicodeString regionName = UnicodeString::fromUTF8(region);
    return gMacroregions->contains(static_cast<void *>(&regionName));
}
// Adds likely subtags to language/script/region using the likely-subtags trie.
//
// "und", "Zzzz" and "ZZ" are treated as empty subtags. If all three subtags
// are non-empty the input is returned unchanged. Otherwise the trie is walked
// language -> script -> region, falling back to the "*" (unknown) branch where
// a subtag does not match, and the subtags that were empty are filled in from
// the resulting lsrs[] entry.
//
// @param language, script, region  input subtags (may be empty)
// @param returnInputIfUnmatch      when true and nothing matched in the trie,
//                                  an LSR with three empty subtags is returned
//                                  so that the caller can detect "no match"
// @param errorCode                 standard ICU error code
// @return the maximized LSR; its flags record which subtags were retained
//         from the input (language=4, script=2, region=1)
LSR LikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region,
                             bool returnInputIfUnmatch,
                             UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    if (language.compare("und") == 0) {
        language = "";
    }
    if (script.compare("Zzzz") == 0) {
        script = "";
    }
    if (region.compare("ZZ") == 0) {
        region = "";
    }
    if (!script.empty() && !region.empty() && !language.empty()) {
        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);  // already maximized
    }
    bool retainLanguage = false;
    bool retainScript = false;
    bool retainRegion = false;

    BytesTrie iter(trie);
    uint64_t state;
    int32_t value;
    // Small optimization: Array lookup for first language letter.
    // The length is checked first so that data() is never dereferenced for an
    // empty (possibly null or unterminated) StringPiece.
    int32_t c0 = -1;
    if (language.length() >= 2 &&
            0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 &&
            (state = trieFirstLetterStates[c0]) != 0) {
        value = trieNext(iter.resetToState64(state), language, 1);
    } else {
        value = trieNext(iter, language, 0);
    }
    bool matchLanguage = (value >= 0);
    bool matchScript = false;
    if (value >= 0) {
        retainLanguage = !language.empty();
        state = iter.getState64();
    } else {
        retainLanguage = true;
        iter.resetToState64(trieUndState);  // "und" ("*")
        state = 0;
    }

    if (value >= 0 && !script.empty()) {
        matchScript = true;
    }
    if (value > 0) {
        // Intermediate or final value from just language.
        if (value == SKIP_SCRIPT) {
            value = 0;
        }
        retainScript = !script.empty();
    } else {
        value = trieNext(iter, script, 0);
        if (value >= 0) {
            retainScript = !script.empty();
            state = iter.getState64();
        } else {
            retainScript = true;
            if (state == 0) {
                iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
            } else {
                // The language matched but the script did not:
                // continue from the language state with an unknown script.
                iter.resetToState64(state);
                value = trieNext(iter, "", 0);
                U_ASSERT(value >= 0);
                state = iter.getState64();
            }
        }
    }

    bool matchRegion = false;
    if (value > 0) {
        // Final value from just language or language+script.
        retainRegion = !region.empty();
    } else {
        value = trieNext(iter, region, 0);
        if (value >= 0) {
            // A macroregion that matched is not retained, so it is replaced
            // by the region of the matched entry.
            if (!region.empty() && !isMacroregion(region, errorCode)) {
                retainRegion = true;
                matchRegion = true;
            }
        } else {
            retainRegion = true;
            if (state == 0) {
                value = defaultLsrIndex;
            } else {
                iter.resetToState64(state);
                value = trieNext(iter, "", 0);
                U_ASSERT(value > 0);
            }
        }
    }
    U_ASSERT(value < lsrsLength);
    const LSR &matched = lsrs[value];

    if (returnInputIfUnmatch &&
        (!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
      return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode);  // no matching.
    }
    if (language.empty()) {
        language = StringPiece("und");
    }

    if (!(retainLanguage || retainScript || retainRegion)) {
        // Quickly return a copy of the lookup-result LSR
        // without new allocation of the subtags.
        return LSR(matched.language, matched.script, matched.region, matched.flags);
    }
    if (!retainLanguage) {
        language = matched.language;
    }
    if (!retainScript) {
        script = matched.script;
    }
    if (!retainRegion) {
        region = matched.region;
    }
    int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
    // retainMask flags = LSR explicit-subtag flags
    return LSR(language, script, region, retainMask, errorCode);
}
// Tests whether lsr is "more likely" than other and returns an updated
// likelyInfo value with bit 0 set if it is.
//
// likelyInfo layout when >= 0: bits 2 and up hold a cached lsrs[] index;
// bit 1 is set if that index was looked up for a region comparison (equal
// language and script) and clear if it was looked up for a script comparison.
int32_t LikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
    if (uprv_strcmp(lsr.language, other.language) != 0) {
        return 0xfffffffc;  // negative, lsr not better than other
    }

    const bool scriptsDiffer = uprv_strcmp(lsr.script, other.script) != 0;
    if (scriptsDiffer) {
        // Reuse the cached index only if it came from a script comparison.
        const bool cacheUsable = likelyInfo >= 0 && (likelyInfo & 2) == 0;
        int32_t index;
        if (cacheUsable) {
            index = likelyInfo >> 2;
        } else {
            index = getLikelyIndex(lsr.language, "");
            likelyInfo = index << 2;
        }
        const LSR &likely = lsrs[index];
        const bool isLikelyScript = uprv_strcmp(lsr.script, likely.script) == 0;
        return isLikelyScript ? (likelyInfo | 1) : (likelyInfo & ~1);
    }

    const bool regionsDiffer = uprv_strcmp(lsr.region, other.region) != 0;
    if (regionsDiffer) {
        // Reuse the cached index only if it came from a region comparison.
        const bool cacheUsable = likelyInfo >= 0 && (likelyInfo & 2) != 0;
        int32_t index;
        if (cacheUsable) {
            index = likelyInfo >> 2;
        } else {
            index = getLikelyIndex(lsr.language, lsr.region);
            likelyInfo = (index << 2) | 2;
        }
        const LSR &likely = lsrs[index];
        const bool isLikelyRegion = uprv_strcmp(lsr.region, likely.region) == 0;
        return isLikelyRegion ? (likelyInfo | 1) : (likelyInfo & ~1);
    }

    return likelyInfo & ~1;  // lsr not better than other
}
// Subset of maximize().
// Returns the index into lsrs[] of the likely-subtags entry for the given
// language and script, always using the unknown ("*") branch for the region.
// "und" and "Zzzz" are treated as empty subtags.
// NOTE(review): compareLikely() passes lsr.region as the second argument in
// one of its calls — confirm that this is intended.
int32_t LikelySubtags::getLikelyIndex(const char *language, const char *script) const {
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
if (uprv_strcmp(script, "Zzzz") == 0) {
script = "";
}
BytesTrie iter(trie);
uint64_t state;
int32_t value;
// Small optimization: Array lookup for first language letter.
int32_t c0;
if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
language[1] != 0 && // language.length() >= 2
(state = trieFirstLetterStates[c0]) != 0) {
value = trieNext(iter.resetToState64(state), language, 1);
} else {
value = trieNext(iter, language, 0);
}
// state == 0 from here on means "the language did not match".
if (value >= 0) {
state = iter.getState64();
} else {
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value > 0) {
// Intermediate or final value from just language.
if (value == SKIP_SCRIPT) {
value = 0;
}
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
state = iter.getState64();
} else {
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
// Language matched but script did not: retry from the language
// state with an unknown script.
iter.resetToState64(state);
value = trieNext(iter, "", 0);
U_ASSERT(value >= 0);
state = iter.getState64();
}
}
}
if (value > 0) {
// Final value from just language or language+script.
} else {
// Finish with the unknown-region branch.
value = trieNext(iter, "", 0);
U_ASSERT(value > 0);
}
U_ASSERT(value < lsrsLength);
return value;
}
// Feeds the NUL-terminated subtag s, starting at index i, into the trie.
// An empty subtag is sent as '*'; otherwise each character is sent after
// conversion with uprv_invCharToAscii(), the last one with bit 0x80 set.
// Returns -1 if there is no match, 0 for a match without a value,
// SKIP_SCRIPT for an intermediate value, or the final trie value.
int32_t LikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
UStringTrieResult result;
uint8_t c;
if ((c = s[i]) == 0) {
result = iter.next(u'*');
} else {
for (;;) {
c = uprv_invCharToAscii(c);
// EBCDIC: If s[i] is not an invariant character,
// then c is now 0 and will simply not match anything, which is harmless.
uint8_t next = s[++i];
if (next != 0) {
if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
return -1;
}
} else {
// last character of this subtag
result = iter.next(c | 0x80);
break;
}
c = next;
}
}
switch (result) {
case USTRINGTRIE_NO_MATCH: return -1;
case USTRINGTRIE_NO_VALUE: return 0;
case USTRINGTRIE_INTERMEDIATE_VALUE:
U_ASSERT(iter.getValue() == SKIP_SCRIPT);
return SKIP_SCRIPT;
case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
default: return -1;
}
}
// StringPiece variant of trieNext(): feeds the characters of s from index i
// up to s.length() into the trie (no NUL terminator is required).
// A subtag with no characters left (s.length() == i) is sent as '*';
// otherwise each character is sent after conversion with
// uprv_invCharToAscii(), the last one with bit 0x80 set.
// Returns -1 if there is no match, 0 for a match without a value,
// SKIP_SCRIPT for an intermediate value, or the final trie value.
int32_t LikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) {
UStringTrieResult result;
uint8_t c;
if (s.length() == i) {
result = iter.next(u'*');
} else {
c = s.data()[i];
for (;;) {
c = uprv_invCharToAscii(c);
// EBCDIC: If s[i] is not an invariant character,
// then c is now 0 and will simply not match anything, which is harmless.
if (i+1 != s.length()) {
if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
return -1;
}
c = s.data()[++i];
} else {
// last character of this subtag
result = iter.next(c | 0x80);
break;
}
}
}
switch (result) {
case USTRINGTRIE_NO_MATCH: return -1;
case USTRINGTRIE_NO_VALUE: return 0;
case USTRINGTRIE_INTERMEDIATE_VALUE:
U_ASSERT(iter.getValue() == SKIP_SCRIPT);
return SKIP_SCRIPT;
case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
default: return -1;
}
}
// Removes the subtags that maximize() would add back.
//
// The input is maximized first. Shorter candidates are then tried in turn,
// and the first one that maximizes to an equivalent LSR is returned:
// language only, then language+region and language+script
// (language+script first when favorScript is true). If none qualifies, the
// fully maximized LSR is returned. If the input matches nothing, it is
// returned unchanged.
LSR LikelySubtags::minimizeSubtags(StringPiece language, StringPiece script,
                                    StringPiece region,
                                    bool favorScript,
                                    UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) { return {}; }
    LSR max = maximize(language, script, region, true, errorCode);
    if (U_FAILURE(errorCode)) { return {}; }
    // If no match, return it.
    if (max.language[0] == 0 && max.script[0] == 0 && max.region[0] == 0) {
        // No match. The ICU API mandates:
        // "If this Locale is already in the minimal form, or not valid, or
        // there is no data available for minimization, the Locale will be
        // unchanged."
        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
    }

    // True if max.language plus the given script and region maximizes to max.
    // Yields false on failure; callers check errorCode right afterwards.
    auto maximizesToMax = [&](const char *candidateScript, const char *candidateRegion) {
        LSR candidate = maximize(max.language, candidateScript, candidateRegion, true, errorCode);
        return U_SUCCESS(errorCode) && candidate.isEquivalentTo(max);
    };

    // try language
    if (maximizesToMax("", "")) {
        return LSR(max.language, "", "", LSR::DONT_CARE_FLAGS, errorCode);
    }
    if (U_FAILURE(errorCode)) { return {}; }

    if (!favorScript) {
        // favor Region
        // try language and region
        if (maximizesToMax("", max.region)) {
            return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
        }
        if (U_FAILURE(errorCode)) { return {}; }
    }

    // try language and script
    if (maximizesToMax(max.script, "")) {
        return LSR(max.language, max.script, "", LSR::DONT_CARE_FLAGS, errorCode);
    }
    if (U_FAILURE(errorCode)) { return {}; }

    if (favorScript) {
        // try language and region
        if (maximizesToMax("", max.region)) {
            return LSR(max.language, "", max.region, LSR::DONT_CARE_FLAGS, errorCode);
        }
        if (U_FAILURE(errorCode)) { return {}; }
    }
    return LSR(max.language, max.script, max.region, LSR::DONT_CARE_FLAGS, errorCode);
}
U_NAMESPACE_END

View file

@ -0,0 +1,128 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// loclikelysubtags.h
// created: 2019may08 Markus W. Scherer
#ifndef __LOCLIKELYSUBTAGS_H__
#define __LOCLIKELYSUBTAGS_H__
#include <utility>
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "charstrmap.h"
#include "lsr.h"
U_NAMESPACE_BEGIN
struct LikelySubtagsData;
// Plain holder for the locale distance/matcher data.
// Movable but not copy-assignable.
struct LocaleDistanceData {
LocaleDistanceData() = default;
LocaleDistanceData(LocaleDistanceData &&data);
~LocaleDistanceData();
const uint8_t *distanceTrieBytes = nullptr;
const uint8_t *regionToPartitions = nullptr;
const char **partitions = nullptr;
const LSR *paradigms = nullptr;
// presumably the number of LSR entries in paradigms — confirm against the loader
int32_t paradigmsLength = 0;
const int32_t *distances = nullptr;
private:
LocaleDistanceData &operator=(const LocaleDistanceData &) = delete;
};
/**
 * Likely-subtags service: maximizes and minimizes language/script/region triples (LSR)
 * and exposes the locale distance data. Instances are not copyable; the only
 * constructor is private and the shared instance is obtained via getSingleton().
 */
class LikelySubtags final : public UMemory {
public:
~LikelySubtags();
static constexpr int32_t SKIP_SCRIPT = 1;
// VisibleForTesting
static const LikelySubtags *getSingleton(UErrorCode &errorCode);
// VisibleForTesting
LSR makeMaximizedLsrFrom(const Locale &locale,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
/**
* Tests whether lsr is "more likely" than other.
* For example, fr-Latn-FR is more likely than fr-Latn-CH because
* FR is the default region for fr-Latn.
*
* The likelyInfo caches lookup information between calls.
* The return value is an updated likelyInfo value,
* with bit 0 set if lsr is "more likely".
* The initial value of likelyInfo must be negative.
*/
int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
/**
* Returns a minimized LSR for the given subtags.
* favorScript selects whether language+script is tried before language+region
* when looking for the shortest form that maximizes back to the same result.
*/
LSR minimizeSubtags(StringPiece language, StringPiece script, StringPiece region,
bool favorScript,
UErrorCode &errorCode) const;
// visible for LocaleDistance
const LocaleDistanceData &getDistanceData() const { return distanceData; }
private:
// Construction is private; copying is disabled.
LikelySubtags(LikelySubtagsData &data);
LikelySubtags(const LikelySubtags &other) = delete;
LikelySubtags &operator=(const LikelySubtags &other) = delete;
static void initLikelySubtags(UErrorCode &errorCode);
LSR makeMaximizedLsr(const char *language, const char *script, const char *region,
const char *variant,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
/**
* Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
*/
LSR maximize(const char *language, const char *script, const char *region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
// StringPiece overload of maximize().
LSR maximize(StringPiece language, StringPiece script, StringPiece region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
int32_t getLikelyIndex(const char *language, const char *script) const;
bool isMacroregion(StringPiece& region, UErrorCode &errorCode) const;
static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
static int32_t trieNext(BytesTrie &iter, StringPiece s, int32_t i);
UResourceBundle *langInfoBundle;
// We could store the strings by value, except that if there were few enough strings,
// moving the contents could copy it to a different array,
// invalidating the pointers stored in the maps.
CharString *strings;
CharStringMap languageAliases;
CharStringMap regionAliases;
// The trie maps each lang+script+region (encoded in ASCII) to an index into lsrs.
// There is also a trie value for each intermediate lang and lang+script.
// '*' is used instead of "und", "Zzzz"/"" and "ZZ"/"".
BytesTrie trie;
// NOTE(review): presumably saved BytesTrie states for fast restarts after "und",
// "und-Zzzz" and each first letter a-z -- inferred from the names; confirm in the .cpp.
uint64_t trieUndState;
uint64_t trieUndZzzzState;
int32_t defaultLsrIndex;
uint64_t trieFirstLetterStates[26];
const LSR *lsrs;
#if U_DEBUG
// Only present in debug builds.
int32_t lsrsLength;
#endif
// distance/matcher data: see comment in LikelySubtagsData::load()
LocaleDistanceData distanceData;
};
U_NAMESPACE_END
#endif // __LOCLIKELYSUBTAGS_H__

1315
engine/thirdparty/icu4c/common/locmap.cpp vendored Normal file

File diff suppressed because it is too large Load diff

40
engine/thirdparty/icu4c/common/locmap.h vendored Normal file
View file

@ -0,0 +1,40 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1996-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File locmap.h : Locale Mapping Classes
*
*
* Created by: Helena Shih
*
* Modification History:
*
* Date Name Description
* 3/11/97 aliu Added setId().
* 4/20/99 Madhu Added T_convertToPosix()
* 09/18/00 george Removed the memory leaks.
* 08/23/01 george Convert to C
*============================================================================
*/
#ifndef LOCMAP_H
#define LOCMAP_H
#include "unicode/utypes.h"
/*
 * Masks a host locale ID down to its low 10 bits (0x03FF) and narrows the result to uint16_t.
 * The argument and the whole expansion are parenthesized so that an expression argument
 * such as `a | b` is masked as a unit (`&` binds tighter than `|`, `^`, `?:`).
 */
#define LANGUAGE_LCID(hostID) ((uint16_t)(0x03FF & (hostID)))
/*
 * NOTE(review): presumably converts a host (LCID-style) locale ID into a POSIX-style
 * locale string written to posixID (at most posixIDCapacity chars), reporting errors
 * through status -- inferred from the names only; the definition is not in this header.
 */
U_CAPI int32_t uprv_convertToPosix(uint32_t hostid, char* posixID, int32_t posixIDCapacity, UErrorCode* status);
/* Don't call these functions directly. Use uloc_getLCID instead. */
U_CAPI uint32_t uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status); // Leverage platform conversion if possible
U_CAPI uint32_t uprv_convertToLCID(const char* langID, const char* posixID, UErrorCode* status);
#endif /* LOCMAP_H */

View file

@ -0,0 +1,226 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1997-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: loclikely.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010feb25
* created by: Markus W. Scherer
*
* Code for miscellaneous locale-related resource bundle data access,
* separated out from other .cpp files
* that then do not depend on resource bundle code and this data.
*/
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "charstr.h"
#include "cstring.h"
#include "ulocimp.h"
#include "uresimp.h"
/*
* Lookup a resource bundle table item with fallback on the table level.
* Regular resource bundle lookups perform fallback to parent locale bundles
* and eventually the root bundle, but only for top-level items.
* This function takes the name of a top-level table and of an item in that table
* and performs a lookup of both, falling back until a bundle contains a table
* with this item.
*
* Note: Only the opening of entire bundles falls back through the default locale
* before root. Once a bundle is open, item lookups do not go through the
* default locale because that would result in a mix of languages that is
* unpredictable to the programmer and most likely useless.
*
* Parameters, as used by the code below:
*   path        - passed to ures_open() as the bundle path
*   locale      - locale whose bundle is opened first
*   tableKey    - key of the top-level table
*   subTableKey - optional (may be nullptr) key of a table nested inside tableKey
*   itemKey     - key of the string item to fetch
*   pLength     - receives the length of the returned string
*   pErrorCode  - in/out error code; must not be nullptr (it is dereferenced
*                 unconditionally). If it already indicates failure, nullptr is returned.
* Returns the string found, or nullptr if the bundle could not be opened or the
* function was entered with a failure code.
*/
U_CAPI const char16_t * U_EXPORT2
uloc_getTableStringWithFallback(const char *path, const char *locale,
const char *tableKey, const char *subTableKey,
const char *itemKey,
int32_t *pLength,
UErrorCode *pErrorCode)
{
if (U_FAILURE(*pErrorCode)) { return nullptr; }
/* char localeBuffer[ULOC_FULLNAME_CAPACITY*4];*/
const char16_t *item=nullptr;
UErrorCode errorCode;
/*
* open the bundle for the current locale
* this falls back through the locale's chain to root
*/
errorCode=U_ZERO_ERROR;
icu::LocalUResourceBundlePointer rb(ures_open(path, locale, &errorCode));
if(U_FAILURE(errorCode)) {
/* total failure, not even root could be opened */
*pErrorCode=errorCode;
return nullptr;
} else if(errorCode==U_USING_DEFAULT_WARNING ||
(errorCode==U_USING_FALLBACK_WARNING && *pErrorCode!=U_USING_DEFAULT_WARNING)
) {
/* set the "strongest" error code (success->fallback->default->failure) */
*pErrorCode=errorCode;
}
/* Each iteration tries one bundle; the loop only repeats after an explicit "Fallback" bundle was opened. */
for(;;){
icu::StackUResourceBundle table;
/* NOTE(review): subTable is only referenced by the commented-out code below. */
icu::StackUResourceBundle subTable;
ures_getByKeyWithFallback(rb.getAlias(), tableKey, table.getAlias(), &errorCode);
if (subTableKey != nullptr) {
/*
ures_getByKeyWithFallback(table.getAlias(), subTableKey, subTable.getAlias(), &errorCode);
item = ures_getStringByKeyWithFallback(subTable.getAlias(), itemKey, pLength, &errorCode);
if(U_FAILURE(errorCode)){
*pErrorCode = errorCode;
}
break;*/
/* Descend into the sub-table, reusing `table` as both input and output. */
ures_getByKeyWithFallback(table.getAlias(), subTableKey, table.getAlias(), &errorCode);
}
if(U_SUCCESS(errorCode)){
item = ures_getStringByKeyWithFallback(table.getAlias(), itemKey, pLength, &errorCode);
if(U_FAILURE(errorCode)){
const char* replacement = nullptr;
*pErrorCode = errorCode; /*save the errorCode*/
errorCode = U_ZERO_ERROR;
/* may be a deprecated code */
if(uprv_strcmp(tableKey, "Countries")==0){
replacement = uloc_getCurrentCountryID(itemKey);
}else if(uprv_strcmp(tableKey, "Languages")==0){
replacement = uloc_getCurrentLanguageID(itemKey);
}
/* pointer comparison is ok since uloc_getCurrentCountryID & uloc_getCurrentLanguageID return the key itself if a replacement is not found */
if(replacement!=nullptr && itemKey != replacement){
item = ures_getStringByKeyWithFallback(table.getAlias(), replacement, pLength, &errorCode);
if(U_SUCCESS(errorCode)){
*pErrorCode = errorCode;
break;
}
}
}else{
break;
}
}
if(U_FAILURE(errorCode)){
/* still can't figure out ?.. try the fallback mechanism */
int32_t len = 0;
const char16_t* fallbackLocale = nullptr;
*pErrorCode = errorCode;
errorCode = U_ZERO_ERROR;
fallbackLocale = ures_getStringByKeyWithFallback(table.getAlias(), "Fallback", &len, &errorCode);
if(U_FAILURE(errorCode)){
*pErrorCode = errorCode;
break;
}
icu::CharString explicitFallbackName;
explicitFallbackName.appendInvariantChars(fallbackLocale, len, errorCode);
/* guard against recursive fallback */
/* NOTE(review): only the originally requested locale is compared here; a cycle among
   intermediate fallback bundles (A -> B -> C -> B) would not be detected by this check. */
if (explicitFallbackName == locale) {
*pErrorCode = U_INTERNAL_PROGRAM_ERROR;
break;
}
rb.adoptInstead(ures_open(path, explicitFallbackName.data(), &errorCode));
if(U_FAILURE(errorCode)){
*pErrorCode = errorCode;
break;
}
/* succeeded in opening the fallback bundle .. continue and try to fetch the item */
}else{
break;
}
}
return item;
}
namespace {
/**
 * Shared implementation of the character/line orientation getters.
 *
 * Canonicalizes localeId, fetches the string stored under `key` in the "layout"
 * table, and maps its first character ('b', 'l', 'r' or 't') to a ULayoutType.
 *
 * @param localeId locale name to look up
 * @param key      item key inside the "layout" table
 * @param status   in/out error code; set to U_INTERNAL_PROGRAM_ERROR when the
 *                 stored value starts with an unexpected character
 * @return the layout type, or ULOC_LAYOUT_UNKNOWN on failure or for an empty value
 */
ULayoutType
_uloc_getOrientationHelper(const char* localeId,
                           const char* key,
                           UErrorCode& status)
{
    if (U_FAILURE(status)) { return ULOC_LAYOUT_UNKNOWN; }

    icu::CharString canonicalId = ulocimp_canonicalize(localeId, status);
    if (U_FAILURE(status)) { return ULOC_LAYOUT_UNKNOWN; }

    int32_t valueLength = 0;
    const char16_t* const text = uloc_getTableStringWithFallback(
        nullptr, canonicalId.data(), "layout", nullptr, key, &valueLength, &status);
    if (U_FAILURE(status) || valueLength == 0) { return ULOC_LAYOUT_UNKNOWN; }

    // Only the first code unit of the stored value decides the orientation.
    switch (text[0]) {
    case u'b':
        return ULOC_LAYOUT_BTT;
    case u'l':
        return ULOC_LAYOUT_LTR;
    case u'r':
        return ULOC_LAYOUT_RTL;
    case u't':
        return ULOC_LAYOUT_TTB;
    default:
        status = U_INTERNAL_PROGRAM_ERROR;
        return ULOC_LAYOUT_UNKNOWN;
    }
}
} // namespace
/**
 * Get the layout character orientation for the specified locale.
 * Delegates to _uloc_getOrientationHelper() with the "characters" key.
 *
 * @param localeId locale name
 * @param status Error status; must not be nullptr (it is dereferenced unconditionally)
 * @return the ULayoutType produced by the helper for the "characters" key.
 */
U_CAPI ULayoutType U_EXPORT2
uloc_getCharacterOrientation(const char* localeId,
UErrorCode *status)
{
return _uloc_getOrientationHelper(localeId, "characters", *status);
}
/**
* Get the layout line orientation for the specified locale.
* Delegates to _uloc_getOrientationHelper() with the "lines" key.
*
* @param localeId locale name
* @param status Error status; must not be nullptr (it is dereferenced unconditionally)
* @return an enum indicating the layout orientation for lines.
*/
U_CAPI ULayoutType U_EXPORT2
uloc_getLineOrientation(const char* localeId,
UErrorCode *status)
{
return _uloc_getOrientationHelper(localeId, "lines", *status);
}

View file

@ -0,0 +1,276 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2002-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_SERVICE || !UCONFIG_NO_TRANSLITERATION
#include "unicode/resbund.h"
#include "unicode/uenum.h"
#include "cmemory.h"
#include "ustrfmt.h"
#include "locutil.h"
#include "charstr.h"
#include "ucln_cmn.h"
#include "uassert.h"
#include "umutex.h"
// see LocaleUtility::getAvailableLocaleNames
static icu::UInitOnce LocaleUtilityInitOnce {};
static icu::Hashtable * LocaleUtility_cache = nullptr;
#define UNDERSCORE_CHAR ((char16_t)0x005f)
#define AT_SIGN_CHAR ((char16_t)64)
#define PERIOD_CHAR ((char16_t)46)
/*
******************************************************************
*/
/**
* Release all static memory held by Locale Utility.
*/
U_CDECL_BEGIN
/**
 * Cleanup callback (registered by locale_utility_init) that destroys the
 * LocaleUtility cache and resets the pointer. Always reports success.
 */
static UBool U_CALLCONV service_cleanup() {
    // Deleting a null pointer is a no-op, so no explicit guard is required.
    delete LocaleUtility_cache;
    LocaleUtility_cache = nullptr;
    return true;
}
/**
 * One-time initializer for LocaleUtility_cache (invoked through umtx_initOnce).
 * Registers service_cleanup and creates the top-level hashtable, whose values are
 * deleted with uhash_deleteHashtable. On any failure the cache pointer is left null
 * and status carries the error.
 */
static void U_CALLCONV locale_utility_init(UErrorCode &status) {
    using namespace icu;
    U_ASSERT(LocaleUtility_cache == nullptr);
    ucln_common_registerCleanup(UCLN_COMMON_SERVICE, service_cleanup);

    LocaleUtility_cache = new Hashtable(status);
    if (U_SUCCESS(status) && LocaleUtility_cache != nullptr) {
        // Normal path: the cache owns its second-level hashtables.
        LocaleUtility_cache->setValueDeleter(uhash_deleteHashtable);
        return;
    }

    if (U_FAILURE(status)) {
        // The constructor reported an error; discard the (possibly null) object.
        delete LocaleUtility_cache;
        LocaleUtility_cache = nullptr;
    } else {
        // Allocation returned null without setting an error.
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
U_CDECL_END
U_NAMESPACE_BEGIN
/**
 * Copies *id into result and fixes letter case only: the part before the first '_'
 * is lowercased and the part from that '_' up to the cut-off is uppercased, where
 * the cut-off is the first '@' or '.' or the end of the string, whichever comes first.
 * Only ASCII letters are changed; nothing at or after the cut-off is touched.
 *
 * @param id     source string; if nullptr, result is set to bogus
 * @param result receives the case-fixed copy
 * @return result
 */
UnicodeString&
LocaleUtility::canonicalLocaleString(const UnicodeString* id, UnicodeString& result)
{
    if (id == nullptr) {
        result.setToBogus();
        return result;
    }

    // Fix case only (no other changes) up to the first '@' or '.' or
    // end of string, whichever comes first. In 3.0 I changed this to
    // stop at first '@' or '.'. It used to run out to the end of
    // string. My fix makes the tests pass but is probably
    // structurally incorrect. See below. [alan 3.0]
    // TODO: Doug, you might want to revise this...
    result = *id;

    // Cut-off: the earlier of '@' and '.', else the string length.
    // A '.' must win when there is no '@' at all (indexOf returned -1).
    int32_t end = result.indexOf(AT_SIGN_CHAR);
    const int32_t period = result.indexOf(PERIOD_CHAR);
    if (period >= 0 && (end < 0 || period < end)) {
        end = period;
    }
    if (end < 0) {
        end = result.length();
    }

    // End of the language part: the first '_' that lies before the cut-off,
    // otherwise the cut-off itself (so the lowercase pass never runs past it).
    int32_t languageEnd = result.indexOf(UNDERSCORE_CHAR);
    if (languageEnd < 0 || languageEnd > end) {
        languageEnd = end;
    }

    int32_t i = 0;
    for (; i < languageEnd; ++i) {
        char16_t c = result.charAt(i);
        if (c >= 0x0041 && c <= 0x005a) {  // 'A'..'Z' -> lowercase
            c += 0x20;
            result.setCharAt(i, c);
        }
    }
    for (; i < end; ++i) {
        char16_t c = result.charAt(i);
        if (c >= 0x0061 && c <= 0x007a) {  // 'a'..'z' -> uppercase
            c -= 0x20;
            result.setCharAt(i, c);
        }
    }
    return result;

#if 0
    // This code does a proper full level 2 canonicalization of id.
    // It's nasty to go from char16_t to char to char to char16_t -- but
    // that's what you have to do to use the uloc_canonicalize
    // function on UnicodeStrings.

    // I ended up doing the alternate fix (see above) not for
    // performance reasons, although performance will certainly be
    // better, but because doing a full level 2 canonicalization
    // causes some tests to fail.  [alan 3.0]

    // TODO: Doug, you might want to revisit this...
    result.setToBogus();
    if (id != 0) {
        int32_t buflen = id->length() + 8; // space for NUL
        char* buf = (char*) uprv_malloc(buflen);
        char* canon = (buf == 0) ? 0 : (char*) uprv_malloc(buflen);
        if (buf != 0 && canon != 0) {
            U_ASSERT(id->extract(0, INT32_MAX, buf, buflen) < buflen);
            UErrorCode ec = U_ZERO_ERROR;
            uloc_canonicalize(buf, canon, buflen, &ec);
            if (U_SUCCESS(ec)) {
                result = UnicodeString(canon);
            }
        }
        uprv_free(buf);
        uprv_free(canon);
    }
    return result;
#endif
}
/**
 * Builds a Locale from a UnicodeString locale ID and stores it in result.
 *
 * A bogus id, or an id that cannot be converted with invariant-character
 * conversion, yields a bogus Locale.
 *
 * @param id     locale ID
 * @param result receives the Locale
 * @return result
 */
Locale&
LocaleUtility::initLocaleFromName(const UnicodeString& id, Locale& result)
{
    if (id.isBogus()) {
        result.setToBogus();
        return result;
    }

    /*
     * We need to convert from a UnicodeString to char * in order to
     * create a Locale.
     *
     * Problem: Locale ID strings may contain '@' which is a variant
     * character and cannot be handled by invariant-character conversion.
     *
     * Hack: Since ICU code can handle locale IDs with multiple encodings
     * of '@' (at least for EBCDIC; it's not known to be a problem for
     * ASCII-based systems),
     * we use regular invariant-character conversion for everything else
     * and manually convert U+0040 into a compiler-char-constant '@'.
     * While this compilation-time constant may not match the runtime
     * encoding of '@', it should be one of the encodings which ICU
     * recognizes.
     *
     * There should be only at most one '@' in a locale ID.
     */
    CharString converted;
    UErrorCode ec = U_ZERO_ERROR;
    int32_t segmentStart = 0;
    while (U_SUCCESS(ec)) {
        const int32_t at = id.indexOf((char16_t)0x40, segmentStart);
        if (at < 0) {
            // No further '@': convert the remainder and stop.
            converted.appendInvariantChars(id.tempSubString(segmentStart), ec);
            break;
        }
        // Invariant-character conversion for the text before this '@' ...
        converted.appendInvariantChars(id.tempSubString(segmentStart, at - segmentStart), ec);
        // ... then emit the '@' itself by hand.
        converted.append('@', ec);
        segmentStart = at + 1;
    }

    if (U_FAILURE(ec)) {
        result.setToBogus();
    } else {
        result = Locale::createFromName(converted.data());
    }
    return result;
}
/**
 * Appends the locale's name (invariant-character conversion) to result,
 * or sets result to bogus when the locale is bogus.
 * Note that the name is appended, so result keeps any text it already holds.
 *
 * @param locale source locale
 * @param result string to append to
 * @return result
 */
UnicodeString&
LocaleUtility::initNameFromLocale(const Locale& locale, UnicodeString& result)
{
    if (!locale.isBogus()) {
        const UnicodeString name(locale.getName(), -1, US_INV);
        result.append(name);
    } else {
        result.setToBogus();
    }
    return result;
}
/**
 * Returns the set of available locale names for the given bundle path, as a
 * Hashtable whose keys are the names. Results are cached per bundleID.
 *
 * @param bundleID path passed to ures_openAvailableLocales (empty means nullptr)
 * @return the cached table, or nullptr on failure. The table stays owned by the cache.
 */
const Hashtable*
LocaleUtility::getAvailableLocaleNames(const UnicodeString& bundleID)
{
    // LocaleUtility_cache is a hash-of-hashes.  The top-level keys
    // are path strings ('bundleID') passed to
    // ures_openAvailableLocales.  The top-level values are
    // second-level hashes.  The second-level keys are result strings
    // from ures_openAvailableLocales.  The second-level values are
    // garbage ((void*)1 or other random pointer).

    UErrorCode status = U_ZERO_ERROR;
    umtx_initOnce(LocaleUtilityInitOnce, locale_utility_init, status);
    Hashtable *cache = LocaleUtility_cache;
    if (cache == nullptr) {
        // Catastrophic failure.
        return nullptr;
    }

    Hashtable* htp;
    umtx_lock(nullptr);
    htp = (Hashtable*) cache->get(bundleID);
    umtx_unlock(nullptr);
    if (htp != nullptr) {
        return htp;
    }

    // Cache miss: build the table outside the lock.
    htp = new Hashtable(status);
    if (htp == nullptr) {
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // The constructor failed; do not leak or hand out a half-built table.
        delete htp;
        return nullptr;
    }

    CharString cbundleID;
    cbundleID.appendInvariantChars(bundleID, status);
    const char* path = cbundleID.isEmpty() ? nullptr : cbundleID.data();
    icu::LocalUEnumerationPointer uenum(ures_openAvailableLocales(path, &status));
    for (;;) {
        const char16_t* id = uenum_unext(uenum.getAlias(), nullptr, &status);
        if (id == nullptr) {
            break;
        }
        // The value is a non-null dummy; only the keys matter.
        htp->put(UnicodeString(id), (void*)htp, status);
    }
    if (U_FAILURE(status)) {
        delete htp;
        return nullptr;
    }

    umtx_lock(nullptr);
    Hashtable *t = static_cast<Hashtable *>(cache->get(bundleID));
    if (t != nullptr) {
        // Another thread raced through this code, creating the cache entry first.
        // Discard ours and return theirs.
        umtx_unlock(nullptr);
        delete htp;
        htp = t;
    } else {
        cache->put(bundleID, (void*)htp, status);
        umtx_unlock(nullptr);
        if (U_FAILURE(status)) {
            // The cache has a value deleter and adopts the value; a failed put is
            // expected to have disposed of it already, so the pointer must not be
            // returned (or deleted again) here.
            htp = nullptr;
        }
    }
    return htp;
}
bool
LocaleUtility::isFallbackOf(const UnicodeString& root, const UnicodeString& child)
{
return child.indexOf(root) == 0 &&
(child.length() == root.length() ||
child.charAt(root.length()) == UNDERSCORE_CHAR);
}
U_NAMESPACE_END
/* !UCONFIG_NO_SERVICE */
#endif

View file

@ -0,0 +1,39 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 2002-2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
#ifndef LOCUTIL_H
#define LOCUTIL_H
#include "unicode/utypes.h"
#include "hash.h"
#if !UCONFIG_NO_SERVICE || !UCONFIG_NO_TRANSLITERATION
U_NAMESPACE_BEGIN
// temporary utility functions, till I know where to find them
// in header so tests can also access them
/**
 * Collection of static helper functions for converting between UnicodeString
 * locale IDs and Locale objects, and for querying available locale names.
 * The class declares no instance state; every member is static.
 */
class U_COMMON_API LocaleUtility {
public:
// Copies *id into result with letter case fixed; returns result.
static UnicodeString& canonicalLocaleString(const UnicodeString* id, UnicodeString& result);
// Sets result from the locale ID string; returns result.
static Locale& initLocaleFromName(const UnicodeString& id, Locale& result);
// Writes the locale's name into result; returns result.
static UnicodeString& initNameFromLocale(const Locale& locale, UnicodeString& result);
// Returns a table of locale names for the bundle path.
static const Hashtable* getAvailableLocaleNames(const UnicodeString& bundleID);
// Tests whether root is a fallback of child.
static bool isFallbackOf(const UnicodeString& root, const UnicodeString& child);
};
U_NAMESPACE_END
#endif
#endif

134
engine/thirdparty/icu4c/common/lsr.cpp vendored Normal file
View file

@ -0,0 +1,134 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// lsr.cpp
// created: 2019may08 Markus W. Scherer
#include "unicode/utypes.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "lsr.h"
#include "uinvchar.h"
#include "ustr_imp.h"
U_NAMESPACE_BEGIN
/**
 * Constructor which prepends `prefix` to both the language and the script, stores
 * the two prefixed strings in one owned buffer ("<prefix>lang\0<prefix>script"),
 * and aliases the region pointer.
 * If errorCode indicates failure on entry, or the copy fails, language and script
 * stay nullptr.
 */
LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
         UErrorCode &errorCode) :
        language(nullptr), script(nullptr), region(r),
        regionIndex(indexForRegion(region)), flags(f) {
    if (U_FAILURE(errorCode)) { return; }

    CharString buffer;
    buffer.append(prefix, errorCode);
    buffer.append(lang, errorCode);
    buffer.append('\0', errorCode);
    // The prefixed script starts right after the language's terminating NUL.
    const int32_t scriptStart = buffer.length();
    buffer.append(prefix, errorCode);
    buffer.append(scr, errorCode);

    owned = buffer.cloneData(errorCode);
    if (U_FAILURE(errorCode)) { return; }
    language = owned;
    script = owned + scriptStart;
}
/**
 * Constructor which copies all three subtags into one owned, NUL-separated buffer
 * ("lang\0script\0region") and points language, script and region into it.
 *
 * The region index is computed from the owned, NUL-terminated copy rather than
 * from r.data(): a StringPiece is a pointer+length view that need not be
 * NUL-terminated (and may carry a null data pointer when empty), whereas
 * indexForRegion() reads up to the terminating NUL.
 *
 * If errorCode indicates failure on entry, or the copy fails, the three subtag
 * pointers stay nullptr and regionIndex stays 0.
 */
LSR::LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
         UErrorCode &errorCode) :
        language(nullptr), script(nullptr), region(nullptr),
        regionIndex(0), flags(f) {
    if (U_SUCCESS(errorCode)) {
        CharString data;
        data.append(lang, errorCode).append('\0', errorCode);
        int32_t scriptOffset = data.length();
        data.append(scr, errorCode).append('\0', errorCode);
        int32_t regionOffset = data.length();
        data.append(r, errorCode);
        owned = data.cloneData(errorCode);
        if (U_SUCCESS(errorCode)) {
            language = owned;
            script = owned + scriptOffset;
            region = owned + regionOffset;
            // Safe: the cloned buffer is NUL-terminated.
            regionIndex = indexForRegion(region);
        }
    }
}
/**
 * Move constructor. Copies every field from `other`. If `other` owned a buffer,
 * ownership moves to this object and `other` is reset so that its destructor
 * does not free the buffer: its language and script become "" and its hash code 0.
 * A source that only aliased its strings is left untouched.
 */
LSR::LSR(LSR &&other) noexcept :
        language(other.language), script(other.script), region(other.region), owned(other.owned),
        regionIndex(other.regionIndex), flags(other.flags),
        hashCode(other.hashCode) {
    if (owned == nullptr) { return; }
    other.owned = nullptr;
    other.script = "";
    other.language = "";
    other.hashCode = 0;
}
// Frees the owned buffer. The `owned` pointer itself is not reset here, so callers
// must not use it afterwards without reassigning it.
void LSR::deleteOwned() {
uprv_free(owned);
}
/**
 * Move assignment. Releases any buffer this object owns, then takes over all
 * fields of `other`. If `other` owned a buffer, ownership moves here and `other`
 * is reset (language and script become "", hash code 0) so it will not free it.
 *
 * Self-assignment is a no-op. The old buffer is released directly instead of by
 * invoking the destructor, so the object's lifetime is never ended while its
 * members are still being assigned.
 */
LSR &LSR::operator=(LSR &&other) noexcept {
    if (this == &other) { return *this; }
    if (owned != nullptr) {
        deleteOwned();  // `owned` is overwritten just below
    }
    language = other.language;
    script = other.script;
    region = other.region;
    regionIndex = other.regionIndex;
    flags = other.flags;
    owned = other.owned;
    hashCode = other.hashCode;
    if (owned != nullptr) {
        other.language = other.script = "";
        other.owned = nullptr;
        other.hashCode = 0;
    }
    return *this;
}
/**
 * Compares language, script and region with `other`, ignoring flags.
 * Regions are compared by index; the region strings are compared only when both
 * indexes are 0 (ill-formed regions).
 */
UBool LSR::isEquivalentTo(const LSR &other) const {
    if (uprv_strcmp(language, other.language) != 0) { return false; }
    if (uprv_strcmp(script, other.script) != 0) { return false; }
    if (regionIndex != other.regionIndex) { return false; }
    // Compare regions if both are ill-formed (and their indexes are 0).
    return regionIndex > 0 || uprv_strcmp(region, other.region) == 0;
}
/**
 * Full equality: the same language, script and region as determined by
 * isEquivalentTo(), and additionally identical flags.
 */
bool LSR::operator==(const LSR &other) const {
    // isEquivalentTo() performs exactly the language/script/region comparison
    // needed here, in the same order.
    return isEquivalentTo(other) && flags == other.flags;
}
/**
 * Maps a NUL-terminated region code to a positive index, or 0 if ill-formed.
 * Exactly three digits ("419") map to 1..1000; exactly two letters ("DE") map to
 * 1001 and up, using uprv_upperOrdinal() for each letter.
 */
int32_t LSR::indexForRegion(const char *region) {
    const int32_t first = region[0];
    const int32_t digit0 = first - '0';
    if (0 <= digit0 && digit0 <= 9) {
        // digits: "419"
        const int32_t digit1 = region[1] - '0';
        if (digit1 < 0 || 9 < digit1) { return 0; }
        const int32_t digit2 = region[2] - '0';
        if (digit2 < 0 || 9 < digit2 || region[3] != 0) { return 0; }
        return (10 * digit0 + digit1) * 10 + digit2 + 1;
    }

    // letters: "DE"
    const int32_t letter0 = uprv_upperOrdinal(first);
    if (letter0 < 0 || 25 < letter0) { return 0; }
    const int32_t letter1 = uprv_upperOrdinal(region[1]);
    if (letter1 < 0 || 25 < letter1 || region[2] != 0) { return 0; }
    return 26 * letter0 + letter1 + 1001;
}
/**
 * Computes and stores the hash code from language, script, region index and
 * flags, unless a non-zero hash code is already stored.
 * @return *this
 */
LSR &LSR::setHashCode() {
    if (hashCode != 0) { return *this; }

    const int32_t languageLength = static_cast<int32_t>(uprv_strlen(language));
    const int32_t scriptLength = static_cast<int32_t>(uprv_strlen(script));

    // Combine the parts with multiplier 37, in unsigned arithmetic.
    uint32_t hash = ustr_hashCharsN(language, languageLength);
    hash = hash * 37 + ustr_hashCharsN(script, scriptLength);
    hash = hash * 37 + regionIndex;
    hashCode = hash * 37 + flags;
    return *this;
}
U_NAMESPACE_END

85
engine/thirdparty/icu4c/common/lsr.h vendored Normal file
View file

@ -0,0 +1,85 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// lsr.h
// created: 2019may08 Markus W. Scherer
#ifndef __LSR_H__
#define __LSR_H__
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "cstring.h"
U_NAMESPACE_BEGIN
/**
 * Language/script/region triple with a region index, flags and an optional hash code.
 * The three subtag pointers either alias caller-provided strings or point into the
 * `owned` buffer, which is freed by the destructor. Movable but not copyable.
 */
struct LSR final : public UMemory {
// Exclusive upper bound of indexForRegion(): 1001 + 26 * 26.
static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26;
// Flag bits: EXPLICIT_LSR (7) is the union of the language (4), script (2) and region (1) bits.
static constexpr int32_t EXPLICIT_LSR = 7;
static constexpr int32_t EXPLICIT_LANGUAGE = 4;
static constexpr int32_t EXPLICIT_SCRIPT = 2;
static constexpr int32_t EXPLICIT_REGION = 1;
static constexpr int32_t IMPLICIT_LSR = 0;
static constexpr int32_t DONT_CARE_FLAGS = 0;
const char *language;
const char *script;
const char *region;
// Buffer owned by this object, or nullptr if the subtag pointers are all aliases.
char *owned = nullptr;
/** Index for region, 0 if ill-formed. @see indexForRegion */
int32_t regionIndex = 0;
int32_t flags = 0;
/** Only set for LSRs that will be used in a hash table. */
int32_t hashCode = 0;
// Default: language "und" with empty script and region.
LSR() : language("und"), script(""), region("") {}
/** Constructor which aliases all subtag pointers. */
LSR(const char *lang, const char *scr, const char *r, int32_t f) :
language(lang), script(scr), region(r),
regionIndex(indexForRegion(region)), flags(f) {}
/**
* Constructor which prepends the prefix to the language and script,
* copies those into owned memory, and aliases the region.
*/
LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
UErrorCode &errorCode);
/** Constructor which copies all three subtags into owned memory. */
LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
UErrorCode &errorCode);
LSR(LSR &&other) noexcept;
LSR(const LSR &other) = delete;
inline ~LSR() {
// Pure inline code for almost all instances.
if (owned != nullptr) {
deleteOwned();
}
}
LSR &operator=(LSR &&other) noexcept;
LSR &operator=(const LSR &other) = delete;
/**
* Returns a positive index (>0) for a well-formed region code.
* Do not rely on a particular region->index mapping; it may change.
* Returns 0 for ill-formed strings.
*/
static int32_t indexForRegion(const char *region);
/** Compares language, script and region; flags are not compared. */
UBool isEquivalentTo(const LSR &other) const;
/** Like isEquivalentTo(), and additionally requires equal flags. */
bool operator==(const LSR &other) const;
inline bool operator!=(const LSR &other) const {
return !operator==(other);
}
/** Computes hashCode if it is still 0; returns *this. */
LSR &setHashCode();
private:
// Out-of-line part of the destructor: frees `owned`.
void deleteOwned();
};
U_NAMESPACE_END
#endif // __LSR_H__

View file

@ -0,0 +1,856 @@
// © 2021 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include <complex>
#include <utility>
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "brkeng.h"
#include "charstr.h"
#include "cmemory.h"
#include "lstmbe.h"
#include "putilimp.h"
#include "uassert.h"
#include "ubrkimpl.h"
#include "uresimp.h"
#include "uvectr32.h"
#include "uvector.h"
#include "unicode/brkiter.h"
#include "unicode/resbund.h"
#include "unicode/ubrk.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"
#include "unicode/utf.h"
U_NAMESPACE_BEGIN
// Uncomment the following #define to debug.
// #define LSTM_DEBUG 1
// #define LSTM_VECTORIZER_DEBUG 1
/**
* Interface for reading 1D array.
* Implementations report their length via d1() and return element i via get(i).
*/
class ReadArray1D {
public:
virtual ~ReadArray1D();
// Number of elements.
virtual int32_t d1() const = 0;
// Element at index i.
virtual float get(int32_t i) const = 0;
#ifdef LSTM_DEBUG
// Debug helper: prints all elements, four per line.
void print() const {
printf("\n[");
for (int32_t i = 0; i < d1(); i++) {
printf("%0.8e ", get(i));
if (i % 4 == 3) printf("\n");
}
printf("]\n");
}
#endif
};
// The destructor is defined out of line with an empty body.
ReadArray1D::~ReadArray1D()
{
}
/**
* Interface for reading 2D array.
* Implementations report their dimensions via d1() and d2() and return the
* element at (i, j) via get(i, j).
*/
class ReadArray2D {
public:
virtual ~ReadArray2D();
// Size of the first dimension.
virtual int32_t d1() const = 0;
// Size of the second dimension.
virtual int32_t d2() const = 0;
// Element at (i, j).
virtual float get(int32_t i, int32_t j) const = 0;
};
// The destructor is defined out of line with an empty body.
ReadArray2D::~ReadArray2D()
{
}
/**
* A class to index a float array as a 1D array without owning the pointer or
* copying the data.
*/
class ConstArray1D : public ReadArray1D {
public:
ConstArray1D() : data_(nullptr), d1_(0) {}
ConstArray1D(const float* data, int32_t d1) : data_(data), d1_(d1) {}
virtual ~ConstArray1D();
// Init the object, the object does not own the data nor copy.
// It is designed to directly use data from memory mapped resources.
// The int32_t data is reinterpreted as float; this asserts IEEE_754 == 1.
void init(const int32_t* data, int32_t d1) {
U_ASSERT(IEEE_754 == 1);
data_ = reinterpret_cast<const float*>(data);
d1_ = d1;
}
// ReadArray1D methods.
virtual int32_t d1() const override { return d1_; }
// NOTE(review): only the upper bound is asserted; a negative i is not checked.
virtual float get(int32_t i) const override {
U_ASSERT(i < d1_);
return data_[i];
}
private:
const float* data_;  // not owned
int32_t d1_;
};
ConstArray1D::~ConstArray1D()
{
}
/**
* A class to index a float array as a 2D array without owning the pointer or
* copying the data. Elements are addressed row-major: (i, j) is data_[i * d2_ + j].
*/
class ConstArray2D : public ReadArray2D {
public:
ConstArray2D() : data_(nullptr), d1_(0), d2_(0) {}
ConstArray2D(const float* data, int32_t d1, int32_t d2)
: data_(data), d1_(d1), d2_(d2) {}
virtual ~ConstArray2D();
// Init the object, the object does not own the data nor copy.
// It is designed to directly use data from memory mapped resources.
// The int32_t data is reinterpreted as float; this asserts IEEE_754 == 1.
void init(const int32_t* data, int32_t d1, int32_t d2) {
U_ASSERT(IEEE_754 == 1);
data_ = reinterpret_cast<const float*>(data);
d1_ = d1;
d2_ = d2;
}
// ReadArray2D methods.
inline int32_t d1() const override { return d1_; }
inline int32_t d2() const override { return d2_; }
// NOTE(review): only the upper bounds are asserted; negative indexes are not checked.
float get(int32_t i, int32_t j) const override {
U_ASSERT(i < d1_);
U_ASSERT(j < d2_);
return data_[i * d2_ + j];
}
// Expose the ith row as a ConstArray1D
// (a non-owning view of d2_ elements starting at row i).
inline ConstArray1D row(int32_t i) const {
U_ASSERT(i < d1_);
return ConstArray1D(data_ + i * d2_, d2_);
}
private:
const float* data_;  // not owned
int32_t d1_;
int32_t d2_;
};
ConstArray2D::~ConstArray2D()
{
}
/**
 * A class to allocate data as a writable 1D array.
 * This is the main class implementing the vector/matrix operations.
 *
 * An instance either owns its storage (allocating constructor; freed in the
 * destructor) or is a writable, non-owning view onto another array's storage
 * (pointer constructor, as returned by slice()); in the latter case memory_ is
 * nullptr and nothing is freed.
 */
class Array1D : public ReadArray1D {
public:
    Array1D() : memory_(nullptr), data_(nullptr), d1_(0) {}

    // Allocates d1 floats and zero-fills them. Sets status to
    // U_MEMORY_ALLOCATION_ERROR if the allocation fails. If status already
    // indicates failure on entry, the storage is not cleared.
    Array1D(int32_t d1, UErrorCode &status)
        : memory_(uprv_malloc(d1 * sizeof(float))),
          data_((float*)memory_), d1_(d1) {
        if (U_SUCCESS(status)) {
            if (memory_ == nullptr) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            clear();
        }
    }

    virtual ~Array1D();

    // A special constructor which does not own the memory but writeable
    // as a slice of an array.
    Array1D(float* data, int32_t d1)
        : memory_(nullptr), data_(data), d1_(d1) {}

    // ReadArray1D methods.
    virtual int32_t d1() const override { return d1_; }
    virtual float get(int32_t i) const override {
        U_ASSERT(i < d1_);
        return data_[i];
    }

    // Return the index which point to the max data in the array.
    // Returns 0 for an empty array, without touching the (possibly null) storage.
    inline int32_t maxIndex() const {
        int32_t index = 0;
        if (d1_ <= 0) {
            return index;
        }
        float max = data_[0];
        for (int32_t i = 1; i < d1_; i++) {
            if (data_[i] > max) {
                max = data_[i];
                index = i;
            }
        }
        return index;
    }

    // Slice part of the array to a new one.
    // The result is a non-owning view sharing this array's storage.
    inline Array1D slice(int32_t from, int32_t size) const {
        U_ASSERT(from >= 0);
        U_ASSERT(from < d1_);
        U_ASSERT(from + size <= d1_);
        return Array1D(data_ + from, size);
    }

    // Add dot product of a 1D array and a 2D array into this one.
    // For every i: this[i] += sum over j of a[j] * b[j][i], accumulated term by term.
    inline Array1D& addDotProduct(const ReadArray1D& a, const ReadArray2D& b) {
        U_ASSERT(a.d1() == b.d1());
        U_ASSERT(b.d2() == d1());
        for (int32_t i = 0; i < d1(); i++) {
            for (int32_t j = 0; j < a.d1(); j++) {
                data_[i] += a.get(j) * b.get(j, i);
            }
        }
        return *this;
    }

    // Hadamard Product the values of another array of the same size into this one.
    inline Array1D& hadamardProduct(const ReadArray1D& a) {
        U_ASSERT(a.d1() == d1());
        for (int32_t i = 0; i < d1(); i++) {
            data_[i] *= a.get(i);
        }
        return *this;
    }

    // Add the Hadamard Product of two arrays of the same size into this one.
    inline Array1D& addHadamardProduct(const ReadArray1D& a, const ReadArray1D& b) {
        U_ASSERT(a.d1() == d1());
        U_ASSERT(b.d1() == d1());
        for (int32_t i = 0; i < d1(); i++) {
            data_[i] += a.get(i) * b.get(i);
        }
        return *this;
    }

    // Add the values of another array of the same size into this one.
    inline Array1D& add(const ReadArray1D& a) {
        U_ASSERT(a.d1() == d1());
        for (int32_t i = 0; i < d1(); i++) {
            data_[i] += a.get(i);
        }
        return *this;
    }

    // Assign the values of another array of the same size into this one.
    inline Array1D& assign(const ReadArray1D& a) {
        U_ASSERT(a.d1() == d1());
        for (int32_t i = 0; i < d1(); i++) {
            data_[i] = a.get(i);
        }
        return *this;
    }

    // Apply tanh to all the elements in the array.
    inline Array1D& tanh() {
        return tanh(*this);
    }

    // Apply tanh of a and store into this array.
    inline Array1D& tanh(const Array1D& a) {
        U_ASSERT(a.d1() == d1());
        for (int32_t i = 0; i < d1_; i++) {
            data_[i] = std::tanh(a.get(i));
        }
        return *this;
    }

    // Apply sigmoid to all the elements in the array.
    inline Array1D& sigmoid() {
        for (int32_t i = 0; i < d1_; i++) {
            data_[i] = 1.0f/(1.0f + expf(-data_[i]));
        }
        return *this;
    }

    // Set every element to zero.
    inline Array1D& clear() {
        uprv_memset(data_, 0, d1_ * sizeof(float));
        return *this;
    }

private:
    void* memory_;  // owned allocation, or nullptr for a non-owning view
    float* data_;   // first element (equals memory_ when owning)
    int32_t d1_;    // number of elements
};

Array1D::~Array1D()
{
    uprv_free(memory_);
}
/**
 * A writable 2D float array that owns its storage.
 * Elements are stored row-major: (i, j) lives at data_[i * d2_ + j].
 *
 * Byte counts are computed as sizeof(float) * d1 * d2 so that the whole product
 * is evaluated in size_t; multiplying the two int32_t dimensions first could
 * overflow before the result is widened.
 */
class Array2D : public ReadArray2D {
public:
    Array2D() : memory_(nullptr), data_(nullptr), d1_(0), d2_(0) {}

    // Allocates d1 x d2 floats and zero-fills them. Sets status to
    // U_MEMORY_ALLOCATION_ERROR if the allocation fails. If status already
    // indicates failure on entry, the storage is not cleared.
    Array2D(int32_t d1, int32_t d2, UErrorCode &status)
        : memory_(uprv_malloc(sizeof(float) * d1 * d2)),
          data_((float*)memory_), d1_(d1), d2_(d2) {
        if (U_SUCCESS(status)) {
            if (memory_ == nullptr) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            clear();
        }
    }

    virtual ~Array2D();

    // ReadArray2D methods.
    virtual int32_t d1() const override { return d1_; }
    virtual int32_t d2() const override { return d2_; }
    virtual float get(int32_t i, int32_t j) const override {
        U_ASSERT(i < d1_);
        U_ASSERT(j < d2_);
        return data_[i * d2_ + j];
    }

    // Expose the ith row as a writable, non-owning Array1D view.
    inline Array1D row(int32_t i) const {
        U_ASSERT(i < d1_);
        return Array1D(data_ + i * d2_, d2_);
    }

    // Set every element to zero.
    inline Array2D& clear() {
        uprv_memset(data_, 0, sizeof(float) * d1_ * d2_);
        return *this;
    }

private:
    void* memory_;  // owned allocation
    float* data_;   // first element (equals memory_)
    int32_t d1_;    // number of rows
    int32_t d2_;    // number of columns
};

Array2D::~Array2D()
{
    uprv_free(memory_);
}
// Class predicted by the LSTM output layer for one input element ("BIES" tags).
// LSTMBreakEngine::divideUpDictionaryRange() casts the argmax index of its
// 4-element output vector to this type, so the enumerator order is significant:
// BEGIN = 0, INSIDE = 1, END = 2, SINGLE = 3.
typedef enum {
BEGIN,
INSIDE,
END,
SINGLE
} LSTMClass;
// Kind of unit the model's dictionary is keyed by. LSTMData's constructor sets it
// from the bundle's "type" string: "codepoints" -> CODE_POINTS,
// "graphclust" -> GRAPHEME_CLUSTER, anything else leaves UNKNOWN.
typedef enum {
UNKNOWN,
CODE_POINTS,
GRAPHEME_CLUSTER,
} EmbeddingType;
// Model data of one LSTM break engine, loaded from a resource bundle.
// The constructor fills every public member; the destructor closes fDict and
// the resource bundle that was passed to the constructor.
struct LSTMData : public UMemory {
LSTMData(UResourceBundle* rb, UErrorCode &status);
~LSTMData();
// Maps each string of the bundle's "dict" array to its position in that array.
UHashtable* fDict;
// Unit type of the dictionary keys, taken from the bundle's "type" string.
EmbeddingType fType;
// Value of the bundle's "model" string.
const char16_t* fName;
// The matrices below are initialized from consecutive regions of the
// bundle's "data" int vector, in exactly this order.
ConstArray2D fEmbedding;   // (dict size + 1) x embedding size
ConstArray2D fForwardW;    // embedding size x (4 * hunits)
ConstArray2D fForwardU;    // hunits x (4 * hunits)
ConstArray1D fForwardB;    // 4 * hunits
ConstArray2D fBackwardW;   // embedding size x (4 * hunits)
ConstArray2D fBackwardU;   // hunits x (4 * hunits)
ConstArray1D fBackwardB;   // 4 * hunits
ConstArray2D fOutputW;     // (2 * hunits) x 4
ConstArray1D fOutputB;     // 4
private:
// Resource bundle given to the constructor; closed by the destructor.
UResourceBundle* fBundle;
};
// Loads an LSTM model from the resource bundle rb.
//
// Reads the "embeddings", "hunits", "type", "model", "data" and "dict" entries,
// builds the string -> index dictionary and carves the "data" int vector into
// the nine weight/bias arrays.
//
// @param rb     Bundle holding the model. It is stored in fBundle and closed by
//               the destructor, including when this constructor fails.
// @param status Set to U_UNSUPPORTED_ERROR when IEEE_754 != 1, to
//               U_INVALID_FORMAT_ERROR when the "data" vector does not have
//               the length implied by the model dimensions, or to whatever
//               error the resource/hash APIs report.
LSTMData::LSTMData(UResourceBundle* rb, UErrorCode &status)
    : fDict(nullptr), fType(UNKNOWN), fName(nullptr),
      fBundle(rb)
{
    if (U_FAILURE(status)) {
        return;
    }
    if (IEEE_754 != 1) {
        status = U_UNSUPPORTED_ERROR;
        return;
    }
    LocalUResourceBundlePointer embeddings_res(
        ures_getByKey(rb, "embeddings", nullptr, &status));
    int32_t embedding_size = ures_getInt(embeddings_res.getAlias(), &status);
    LocalUResourceBundlePointer hunits_res(
        ures_getByKey(rb, "hunits", nullptr, &status));
    if (U_FAILURE(status)) return;
    int32_t hunits = ures_getInt(hunits_res.getAlias(), &status);
    const char16_t* type = ures_getStringByKey(rb, "type", nullptr, &status);
    if (U_FAILURE(status)) return;
    if (u_strCompare(type, -1, u"codepoints", -1, false) == 0) {
        fType = CODE_POINTS;
    } else if (u_strCompare(type, -1, u"graphclust", -1, false) == 0) {
        fType = GRAPHEME_CLUSTER;
    }
    fName = ures_getStringByKey(rb, "model", nullptr, &status);
    LocalUResourceBundlePointer dataRes(ures_getByKey(rb, "data", nullptr, &status));
    if (U_FAILURE(status)) return;
    int32_t data_len = 0;
    const int32_t* data = ures_getIntVector(dataRes.getAlias(), &data_len, &status);
    fDict = uhash_open(uhash_hashUChars, uhash_compareUChars, nullptr, &status);

    StackUResourceBundle stackTempBundle;
    ResourceDataValue value;
    ures_getValueWithFallback(rb, "dict", stackTempBundle.getAlias(), value, status);
    ResourceArray stringArray = value.getArray(status);
    int32_t num_index = stringArray.getSize();
    if (U_FAILURE(status)) { return; }

    // put dict into hash
    int32_t stringLength;
    for (int32_t idx = 0; idx < num_index; idx++) {
        stringArray.getValue(idx, value);
        const char16_t* str = value.getString(stringLength, status);
        uhash_putiAllowZero(fDict, (void*)str, idx, &status);
        if (U_FAILURE(status)) return;
#ifdef LSTM_VECTORIZER_DEBUG
        printf("Assign [");
        while (*str != 0x0000) {
            printf("U+%04x ", *str);
            str++;
        }
        printf("] map to %d\n", idx-1);
#endif
    }

    // Sizes of the nine consecutive regions of the "data" vector.
    int32_t mat1_size = (num_index + 1) * embedding_size;
    int32_t mat2_size = embedding_size * 4 * hunits;
    int32_t mat3_size = hunits * 4 * hunits;
    int32_t mat4_size = 4 * hunits;
    int32_t mat5_size = mat2_size;
    int32_t mat6_size = mat3_size;
    int32_t mat7_size = mat4_size;
    int32_t mat8_size = 2 * hunits * 4;
    int32_t mat9_size = 4;

    // Check the layout at run time. This used to be a debug-only U_ASSERT, so a
    // release build would read past the end of the vector on a mismatch.
    const int64_t expected_len =
        static_cast<int64_t>(mat1_size) + mat2_size + mat3_size + mat4_size + mat5_size +
        mat6_size + mat7_size + mat8_size + mat9_size;
    if (data == nullptr || data_len != expected_len) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }

    fEmbedding.init(data, (num_index + 1), embedding_size);
    data += mat1_size;
    fForwardW.init(data, embedding_size, 4 * hunits);
    data += mat2_size;
    fForwardU.init(data, hunits, 4 * hunits);
    data += mat3_size;
    fForwardB.init(data, 4 * hunits);
    data += mat4_size;
    fBackwardW.init(data, embedding_size, 4 * hunits);
    data += mat5_size;
    fBackwardU.init(data, hunits, 4 * hunits);
    data += mat6_size;
    fBackwardB.init(data, 4 * hunits);
    data += mat7_size;
    fOutputW.init(data, 2 * hunits, 4);
    data += mat8_size;
    fOutputB.init(data, 4);
}
// Closes the dictionary hash table and the resource bundle that was handed to
// the constructor.
LSTMData::~LSTMData() {
uhash_close(fDict);
ures_close(fBundle);
}
class Vectorizer : public UMemory {
public:
Vectorizer(UHashtable* dict) : fDict(dict) {}
virtual ~Vectorizer();
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
UVector32 &offsets, UVector32 &indices,
UErrorCode &status) const = 0;
protected:
int32_t stringToIndex(const char16_t* str) const {
UBool found = false;
int32_t ret = uhash_getiAndFound(fDict, (const void*)str, &found);
if (!found) {
ret = fDict->count;
}
#ifdef LSTM_VECTORIZER_DEBUG
printf("[");
while (*str != 0x0000) {
printf("U+%04x ", *str);
str++;
}
printf("] map to %d\n", ret);
#endif
return ret;
}
private:
UHashtable* fDict;
};
Vectorizer::~Vectorizer()
{
}
// Vectorizer that looks up one dictionary index per code point of the text.
class CodePointsVectorizer : public Vectorizer {
public:
// dict is forwarded to the Vectorizer base class.
CodePointsVectorizer(UHashtable* dict) : Vectorizer(dict) {}
virtual ~CodePointsVectorizer();
// See Vectorizer::vectorize(); implemented out of line.
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
UVector32 &offsets, UVector32 &indices,
UErrorCode &status) const override;
};
// Nothing to release: the class adds no members of its own.
CodePointsVectorizer::~CodePointsVectorizer()
{
}
// Walks text[startPos, endPos) one code point at a time and appends, for each
// code point, its native index to `offsets` and its dictionary index to
// `indices`. Does nothing if the vectors cannot be grown or status is a failure.
void CodePointsVectorizer::vectorize(
    UText *text, int32_t startPos, int32_t endPos,
    UVector32 &offsets, UVector32 &indices, UErrorCode &status) const
{
    if (!offsets.ensureCapacity(endPos - startPos, status) ||
            !indices.ensureCapacity(endPos - startPos, status)) {
        return;
    }
    if (U_FAILURE(status)) return;

    utext_setNativeIndex(text, startPos);
    // One code unit followed by a terminating NUL, as expected by stringToIndex().
    char16_t unit[2] = {0, 0};
    for (;;) {
        if (U_FAILURE(status)) break;
        const int32_t position = (int32_t)utext_getNativeIndex(text);
        if (position >= endPos) break;
        // Since the LSTMBreakEngine currently only accepts chars in the BMP,
        // we can ignore the possibility of hitting a supplementary code
        // point.
        unit[0] = (char16_t) utext_next32(text);
        U_ASSERT(!U_IS_SURROGATE(unit[0]));
        offsets.addElement(position, status);
        indices.addElement(stringToIndex(unit), status);
    }
}
// Vectorizer that looks up one dictionary index per grapheme cluster of the text.
class GraphemeClusterVectorizer : public Vectorizer {
public:
// dict is forwarded to the Vectorizer base class.
GraphemeClusterVectorizer(UHashtable* dict)
: Vectorizer(dict)
{
}
virtual ~GraphemeClusterVectorizer();
// See Vectorizer::vectorize(); implemented out of line.
virtual void vectorize(UText *text, int32_t startPos, int32_t endPos,
UVector32 &offsets, UVector32 &indices,
UErrorCode &status) const override;
};
// Nothing to release: the class adds no members of its own.
GraphemeClusterVectorizer::~GraphemeClusterVectorizer()
{
}
// Maximum number of UTF-16 code units extracted for one grapheme cluster.
constexpr int32_t MAX_GRAPHEME_CLSTER_LENGTH = 10;

// Splits text[startPos, endPos) into grapheme clusters using a character break
// iterator and appends, for each cluster, its start offset to `offsets` and its
// dictionary index to `indices`.
//
// A cluster longer than MAX_GRAPHEME_CLSTER_LENGTH code units makes
// utext_extract() report a failure, which ends the function with that status.
void GraphemeClusterVectorizer::vectorize(
    UText *text, int32_t startPos, int32_t endPos,
    UVector32 &offsets, UVector32 &indices, UErrorCode &status) const
{
    if (U_FAILURE(status)) return;
    if (!offsets.ensureCapacity(endPos - startPos, status) ||
            !indices.ensureCapacity(endPos - startPos, status)) {
        return;
    }
    if (U_FAILURE(status)) return;
    LocalPointer<BreakIterator> graphemeIter(BreakIterator::createCharacterInstance(Locale(), status));
    if (U_FAILURE(status)) return;
    graphemeIter->setText(text, status);
    if (U_FAILURE(status)) return;

    if (startPos != 0) {
        graphemeIter->preceding(startPos);
    }
    int32_t last = startPos;
    int32_t current = startPos;
    // The buffer has one more code unit than the capacity handed to
    // utext_extract(). That unit is never written and stays 0, so the string
    // given to stringToIndex() is NUL-terminated even when a cluster is exactly
    // MAX_GRAPHEME_CLSTER_LENGTH code units long. In that case utext_extract()
    // fills the buffer without a terminator and only reports a warning.
    char16_t str[MAX_GRAPHEME_CLSTER_LENGTH + 1] = {};
    while ((current = graphemeIter->next()) != BreakIterator::DONE) {
        if (current >= endPos) {
            break;
        }
        if (current > startPos) {
            utext_extract(text, last, current, str, MAX_GRAPHEME_CLSTER_LENGTH, &status);
            if (U_FAILURE(status)) return;
            offsets.addElement(last, status);
            indices.addElement(stringToIndex(str), status);
            if (U_FAILURE(status)) return;
        }
        last = current;
    }
    if (U_FAILURE(status) || last >= endPos) {
        return;
    }
    // Remaining text between the last cluster boundary seen and endPos.
    utext_extract(text, last, endPos, str, MAX_GRAPHEME_CLSTER_LENGTH, &status);
    if (U_SUCCESS(status)) {
        offsets.addElement(last, status);
        indices.addElement(stringToIndex(str), status);
    }
}
// Computing LSTM as stated in
// https://en.wikipedia.org/wiki/Long_short-term_memory#LSTM_with_a_forget_gate
// ifco is temp array allocate outside which does not need to be
// input/output value but could avoid unnecessary memory alloc/free if passing
// in.
void compute(
int32_t hunits,
const ReadArray2D& W, const ReadArray2D& U, const ReadArray1D& b,
const ReadArray1D& x, Array1D& h, Array1D& c,
Array1D& ifco)
{
// ifco = x * W + h * U + b
ifco.assign(b)
.addDotProduct(x, W)
.addDotProduct(h, U);
ifco.slice(0*hunits, hunits).sigmoid(); // i: sigmod
ifco.slice(1*hunits, hunits).sigmoid(); // f: sigmoid
ifco.slice(2*hunits, hunits).tanh(); // c_: tanh
ifco.slice(3*hunits, hunits).sigmoid(); // o: sigmod
c.hadamardProduct(ifco.slice(hunits, hunits))
.addHadamardProduct(ifco.slice(0, hunits), ifco.slice(2*hunits, hunits));
h.tanh(c)
.hadamardProduct(ifco.slice(3*hunits, hunits));
}
// Minimum word size
static const int32_t MIN_WORD = 2;
// Minimum number of characters for two words
static const int32_t MIN_WORD_SPAN = MIN_WORD * 2;
// Finds word breaks in text[startPos, endPos) with a bidirectional LSTM and
// appends them to foundBreaks.
//
// The range is vectorized into dictionary indices. The backward LSTM is run
// over the whole sequence and its hidden states are stored, then the forward
// LSTM and the output layer are evaluated together position by position.
// A position whose most likely class is BEGIN or SINGLE (other than the first
// position) is reported as a break.
//
// Returns the number of breaks appended, or 0 on failure or when the range is
// too short to contain two words.
int32_t
LSTMBreakEngine::divideUpDictionaryRange( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool /* isPhraseBreaking */,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
int32_t beginFoundBreakSize = foundBreaks.size();
// Advance MIN_WORD_SPAN code points from startPos to see whether the range
// is long enough to be worth segmenting.
utext_setNativeIndex(text, startPos);
utext_moveIndex32(text, MIN_WORD_SPAN);
if (utext_getNativeIndex(text) >= endPos) {
return 0; // Not enough characters for two words
}
utext_setNativeIndex(text, startPos);
UVector32 offsets(status);
UVector32 indices(status);
if (U_FAILURE(status)) return 0;
fVectorizer->vectorize(text, startPos, endPos, offsets, indices, status);
if (U_FAILURE(status)) return 0;
int32_t* offsetsBuf = offsets.getBuffer();
int32_t* indicesBuf = indices.getBuffer();
int32_t input_seq_len = indices.size();
int32_t hunits = fData->fForwardU.d1();
// ----- Begin of all the Array memory allocation needed for this function
// Allocate temp array used inside compute()
Array1D ifco(4 * hunits, status);
Array1D c(hunits, status);
Array1D logp(4, status);
// TODO: limit size of hBackward. If input_seq_len is too big, we could
// run out of memory.
// Backward LSTM
Array2D hBackward(input_seq_len, hunits, status);
// Allocate fbRow and slice the internal array in two.
Array1D fbRow(2 * hunits, status);
// ----- End of all the Array memory allocation needed for this function
if (U_FAILURE(status)) return 0;
// To save the needed memory usage, the following is different from the
// Python or ICU4X implementation. We first perform the Backward LSTM
// and then merge the iteration of the forward LSTM and the output layer
// together because we only need to remember the h[t-1] for Forward LSTM.
for (int32_t i = input_seq_len - 1; i >= 0; i--) {
Array1D hRow = hBackward.row(i);
// Start each step from the hidden state of the following position.
if (i != input_seq_len - 1) {
hRow.assign(hBackward.row(i+1));
}
#ifdef LSTM_DEBUG
printf("hRow %d\n", i);
hRow.print();
printf("indicesBuf[%d] = %d\n", i, indicesBuf[i]);
printf("fData->fEmbedding.row(indicesBuf[%d]):\n", i);
fData->fEmbedding.row(indicesBuf[i]).print();
#endif // LSTM_DEBUG
compute(hunits,
fData->fBackwardW, fData->fBackwardU, fData->fBackwardB,
fData->fEmbedding.row(indicesBuf[i]),
hRow, c, ifco);
}
Array1D forwardRow = fbRow.slice(0, hunits); // point to first half of data in fbRow.
Array1D backwardRow = fbRow.slice(hunits, hunits); // point to second half of data in fbRow.
// The following iteration merges the forward LSTM and the output layer
// together.
c.clear(); // reuse c since it is the same size.
for (int32_t i = 0; i < input_seq_len; i++) {
#ifdef LSTM_DEBUG
printf("forwardRow %d\n", i);
forwardRow.print();
#endif // LSTM_DEBUG
// Forward LSTM
// Calculate the result into forwardRow, which points to the data in the first half
// of fbRow.
compute(hunits,
fData->fForwardW, fData->fForwardU, fData->fForwardB,
fData->fEmbedding.row(indicesBuf[i]),
forwardRow, c, ifco);
// assign the data from hBackward.row(i) to the second half of fbRow.
backwardRow.assign(hBackward.row(i));
logp.assign(fData->fOutputB).addDotProduct(fbRow, fData->fOutputW);
#ifdef LSTM_DEBUG
printf("backwardRow %d\n", i);
backwardRow.print();
printf("logp %d\n", i);
logp.print();
#endif // LSTM_DEBUG
// current = argmax(logp)
LSTMClass current = (LSTMClass)logp.maxIndex();
// BIES logic.
if (current == BEGIN || current == SINGLE) {
// The first position is never reported as a break.
if (i != 0) {
foundBreaks.addElement(offsetsBuf[i], status);
if (U_FAILURE(status)) return 0;
}
}
}
return foundBreaks.size() - beginFoundBreakSize;
}
// Creates the Vectorizer that matches data->fType.
//
// @param data   Model data; its fDict is borrowed by the returned vectorizer.
// @param status Left untouched if already a failure (nullptr is returned).
//               Set to U_MEMORY_ALLOCATION_ERROR when the vectorizer cannot be
//               allocated. Previously a failed allocation returned nullptr with
//               a success status, and the engine later dereferenced it.
// @return A new vectorizer owned by the caller, or nullptr on failure.
Vectorizer* createVectorizer(const LSTMData* data, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    Vectorizer* vectorizer = nullptr;
    switch (data->fType) {
        case CODE_POINTS:
            vectorizer = new CodePointsVectorizer(data->fDict);
            break;
        case GRAPHEME_CLUSTER:
            vectorizer = new GraphemeClusterVectorizer(data->fDict);
            break;
        default:
            // Unknown embedding type: same handling as before.
            UPRV_UNREACHABLE_EXIT;
    }
    if (vectorizer == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return vectorizer;
}
// Builds an engine around the model `data` for the characters in `set`.
// The vectorizer is created from data->fType via createVectorizer().
// On success the engine owns `data` (the destructor deletes it). On failure
// fData is reset to nullptr so that `data` stays with the caller.
LSTMBreakEngine::LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status)
: DictionaryBreakEngine(), fData(data), fVectorizer(createVectorizer(fData, status))
{
if (U_FAILURE(status)) {
fData = nullptr; // If failure, we should not delete fData in destructor because the caller will do so.
return;
}
setCharacters(set);
}
// Deletes the model data and the vectorizer. fData is nullptr here when the
// constructor failed, in which case the caller still owns the data.
LSTMBreakEngine::~LSTMBreakEngine() {
delete fData;
delete fVectorizer;
}
// Returns the model name stored in the engine's LSTMData (fName).
// Dereferences fData without a null check, so it must only be called on a
// successfully constructed engine.
const char16_t* LSTMBreakEngine::name() const {
return fData->fName;
}
// Looks up the name of the default LSTM model for `script`: the string stored
// under the script's short name in the "lstm" table of the root break-iterator
// bundle. Errors are reported through status.
UnicodeString defaultLSTM(UScriptCode script, UErrorCode& status) {
    // Open root from the brkitr tree, then reuse the same bundle object as the
    // fill-in for its "lstm" sub-table so that only one handle has to be closed.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "lstm", bundle, &status);
    const char *scriptKey = uscript_getShortName(script);
    UnicodeString modelName = ures_getUnicodeStringByKey(bundle, scriptKey, &status);
    ures_close(bundle);
    return modelName;
}
// Loads the default LSTM model for `script`.
// Returns nullptr, leaving status untouched, for scripts other than Khmer, Lao,
// Myanmar and Thai, and nullptr with a failure status when the model name or
// its bundle cannot be found. Otherwise returns the result of CreateLSTMData().
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(UScriptCode script, UErrorCode& status)
{
    switch (script) {
        case USCRIPT_KHMER:
        case USCRIPT_LAO:
        case USCRIPT_MYANMAR:
        case USCRIPT_THAI:
            break;
        default:
            return nullptr;
    }
    UnicodeString name = defaultLSTM(script, status);
    if (U_FAILURE(status)) return nullptr;

    // The bundle name is the model name with everything from the last '.' removed.
    CharString namebuf;
    namebuf.appendInvariantChars(name, status);
    namebuf.truncate(namebuf.lastIndexOf('.'));

    LocalUResourceBundlePointer rb(
        ures_openDirect(U_ICUDATA_BRKITR, namebuf.data(), &status));
    if (U_FAILURE(status)) return nullptr;

    // Ownership of the bundle moves to the LSTMData.
    return CreateLSTMData(rb.orphan(), status);
}
// Creates the model data for the resource bundle rb.
//
// Ownership of rb always passes to this function: on success the returned
// LSTMData closes it in its destructor, on failure it is closed before
// returning.
//
// @param rb     Resource bundle holding the LSTM model.
// @param status Set to U_MEMORY_ALLOCATION_ERROR if the object cannot be
//               allocated, or to the error reported while loading the model.
// @return The new model data, or nullptr when status indicates a failure.
//         Previously a partially initialized object was returned on failure,
//         which leaked when the caller only inspected status.
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(UResourceBundle* rb, UErrorCode& status)
{
    LSTMData* data = new LSTMData(rb, status);
    if (data == nullptr) {
        // No LSTMData exists to close the bundle later, so close it here.
        ures_close(rb);
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return nullptr;
    }
    if (U_FAILURE(status)) {
        delete data;  // Also closes rb.
        return nullptr;
    }
    return data;
}
// Creates an LSTM break engine for `script` using the model `data`.
//
// Only Thai and Myanmar are supported. For any other script `data` is deleted
// and nullptr is returned. If the engine cannot be allocated, status is set to
// U_MEMORY_ALLOCATION_ERROR. If construction fails, the engine is deleted.
// In both of those cases nullptr is returned and `data` is not deleted here.
U_CAPI const LanguageBreakEngine* U_EXPORT2
CreateLSTMBreakEngine(UScriptCode script, const LSTMData* data, UErrorCode& status)
{
    // Pattern of the characters handled by the engine.
    const char16_t* pattern = nullptr;
    switch(script) {
        case USCRIPT_THAI:
            pattern = u"[[:Thai:]&[:LineBreak=SA:]]";
            break;
        case USCRIPT_MYANMAR:
            pattern = u"[[:Mymr:]&[:LineBreak=SA:]]";
            break;
        default:
            delete data;
            return nullptr;
    }
    UnicodeSet unicodeSet;
    unicodeSet.applyPattern(UnicodeString(pattern), status);

    const LanguageBreakEngine* engine = new LSTMBreakEngine(data, unicodeSet, status);
    if (engine == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        delete engine;
        return nullptr;
    }
    return engine;
}
// Deletes model data created by CreateLSTMData() or CreateLSTMDataForScript().
// Passing nullptr is a no-op, as with any delete expression.
U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data)
{
delete data;
}
// Returns the model name (fName) stored in `data`.
// `data` is dereferenced without a null check and must not be nullptr.
U_CAPI const char16_t* U_EXPORT2 LSTMDataName(const LSTMData* data)
{
return data->fName;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

88
engine/thirdparty/icu4c/common/lstmbe.h vendored Normal file
View file

@ -0,0 +1,88 @@
// © 2021 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef LSTMBE_H
#define LSTMBE_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uniset.h"
#include "unicode/ures.h"
#include "unicode/utext.h"
#include "unicode/utypes.h"
#include "brkeng.h"
#include "dictbe.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
class Vectorizer;
struct LSTMData;
/*******************************************************************
* LSTMBreakEngine
*/
/**
* <p>LSTMBreakEngine is a kind of DictionaryBreakEngine that uses an
* LSTM to determine language-specific breaks.</p>
*
* <p>After it is constructed an LSTMBreakEngine may be shared between
* threads without synchronization.</p>
*/
class LSTMBreakEngine : public DictionaryBreakEngine {
public:
/**
* <p>Constructor.</p>
*
* @param data The LSTM model data used by this engine.
* @param set The set of characters handled by this engine.
* @param status Information on any errors encountered.
*/
LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~LSTMBreakEngine();
/**
* @return The name of the LSTM model used by this engine.
*/
virtual const char16_t* name() const;
protected:
/**
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output vector to which the break positions found are appended
* @param isPhraseBreaking Whether phrase breaking is requested
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange(UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode& status) const override;
private:
/** The LSTM model data. */
const LSTMData* fData;
/** Converts text into the sequence of dictionary indices fed to the LSTM. */
const Vectorizer* fVectorizer;
};
/** Creates an LSTM break engine for the script from the given model data. */
U_CAPI const LanguageBreakEngine* U_EXPORT2 CreateLSTMBreakEngine(
UScriptCode script, const LSTMData* data, UErrorCode& status);
/** Creates LSTM model data from the given resource bundle. */
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(
UResourceBundle* rb, UErrorCode& status);
/** Creates the default LSTM model data for the script. */
U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(
UScriptCode script, UErrorCode& status);
/** Deletes LSTM model data returned by one of the Create functions above. */
U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data);
/** Returns the name of the model held by the given LSTM model data. */
U_CAPI const char16_t* U_EXPORT2 LSTMDataName(const LSTMData* data);
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif /* LSTMBE_H */

View file

@ -0,0 +1,65 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: messageimpl.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2011apr04
* created by: Markus W. Scherer
*/
#ifndef __MESSAGEIMPL_H__
#define __MESSAGEIMPL_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING
#include "unicode/messagepattern.h"
U_NAMESPACE_BEGIN
/**
* Helper functions for use of MessagePattern.
* In Java, these are package-private methods in MessagePattern itself.
* In C++, they are declared here and implemented in messagepattern.cpp.
* The class holds only static methods and cannot be instantiated.
*/
class U_COMMON_API MessageImpl {
public:
/**
* @param msgPattern the pattern whose apostrophe mode is tested
* @return true if getApostropheMode()==UMSGPAT_APOS_DOUBLE_REQUIRED
*/
static UBool jdkAposMode(const MessagePattern &msgPattern) {
return msgPattern.getApostropheMode()==UMSGPAT_APOS_DOUBLE_REQUIRED;
}
/**
* Appends the s[start, limit[ substring (start inclusive, limit exclusive)
* to sb, but with only half of the apostrophes
* according to JDK pattern behavior.
*/
static void appendReducedApostrophes(const UnicodeString &s, int32_t start, int32_t limit,
UnicodeString &sb);
/**
* Appends the sub-message to the result string.
* Omits SKIP_SYNTAX and appends whole arguments using appendReducedApostrophes().
* @return the result parameter
*/
static UnicodeString &appendSubMessageWithoutSkipSyntax(const MessagePattern &msgPattern,
int32_t msgStart,
UnicodeString &result);
private:
MessageImpl() = delete; // no constructor: all static methods
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_FORMATTING
#endif // __MESSAGEIMPL_H__

File diff suppressed because it is too large Load diff

270
engine/thirdparty/icu4c/common/mlbe.cpp vendored Normal file
View file

@ -0,0 +1,270 @@
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "cmemory.h"
#include "mlbe.h"
#include "uassert.h"
#include "ubrkimpl.h"
#include "unicode/resbund.h"
#include "unicode/udata.h"
#include "unicode/utf16.h"
#include "uresimp.h"
#include "util.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Index in fModel of the first table of each feature group: the six unigram
// (UW) tables start at 0, the three bigram (BW) tables at 6 and the four
// trigram (TW) tables at 9.
enum class ModelIndex { kUWStart = 0, kBWStart = 6, kTWStart = 9 };
// Copies the two character sets, then loads the model via loadMLModel().
// If status already indicates a failure on entry, the model is not loaded.
MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
const UnicodeSet &closePunctuationSet, UErrorCode &status)
: fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
fClosePunctuationSet(closePunctuationSet),
fNegativeSum(0) {
if (U_FAILURE(status)) {
return;
}
loadMLModel(status);
}
// The destructor body is empty; members are destroyed automatically.
MlBreakEngine::~MlBreakEngine() {}
// Finds phrase breaks in inString (the normalized text of
// inText[rangeStart, rangeEnd)) and pushes them, as native indices of inText,
// onto foundBreaks.
//
// @param inText      The original text.
// @param rangeStart  Start of the range in inText.
// @param rangeEnd    End of the range in inText; must be greater than rangeStart,
//                    otherwise status is set to U_ILLEGAL_ARGUMENT_ERROR.
// @param foundBreaks Receives the break positions.
// @param inString    The normalized text of the range.
// @param inputMap    Maps code point positions of inString to native indices of
//                    inText. When it is not valid, position + rangeStart is used.
// @param status      Information on any errors encountered.
// @return The number of breaks added to foundBreaks, or 0 on failure.
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
                                     UVector32 &foundBreaks, const UnicodeString &inString,
                                     const LocalPointer<UVector32> &inputMap,
                                     UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    if (rangeStart >= rangeEnd) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    const int32_t codePointLength = inString.countChar32();
    UVector32 boundary(codePointLength + 1, status);
    if (U_FAILURE(status)) {
        return 0;
    }
    int32_t numBreaks = 0;
    // The ML algorithm groups six char and evaluates whether the 4th char is a breakpoint.
    // In each iteration, it evaluates the 4th char and then moves forward one char like a sliding
    // window. Initially, the first six values in the indexList are [-1, -1, 0, 1, 2, 3]. After
    // moving forward, finally the last six values in the indexList are
    // [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra "-1".
    int32_t indexSize = codePointLength + 4;
    int32_t *indexList = (int32_t *)uprv_malloc(indexSize * sizeof(int32_t));
    if (indexList == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    int32_t numCodeUnits = initIndexList(inString, indexList, status);

    // Add a break for the start.
    boundary.addElement(0, status);
    numBreaks++;
    if (U_FAILURE(status)) {
        // indexList must be released on this path too; it used to leak here.
        uprv_free(indexList);
        return 0;
    }

    for (int32_t idx = 0; idx + 1 < codePointLength && U_SUCCESS(status); idx++) {
        numBreaks =
            evaluateBreakpoint(inString, indexList, idx, numCodeUnits, numBreaks, boundary, status);
        // Slide the window: record the code unit index of the next code point.
        if (idx + 4 < codePointLength) {
            indexList[idx + 6] = numCodeUnits;
            numCodeUnits += U16_LENGTH(inString.char32At(indexList[idx + 6]));
        }
    }
    uprv_free(indexList);

    if (U_FAILURE(status)) return 0;

    // Add a break for the end if there is not one there already.
    if (boundary.lastElementi() != codePointLength) {
        boundary.addElement(codePointLength, status);
        numBreaks++;
    }

    int32_t prevCPPos = -1;
    int32_t prevUTextPos = -1;
    int32_t correctedNumBreaks = 0;
    for (int32_t i = 0; i < numBreaks; i++) {
        int32_t cpPos = boundary.elementAti(i);
        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        U_ASSERT(cpPos > prevCPPos);
        U_ASSERT(utextPos >= prevUTextPos);

        if (utextPos > prevUTextPos) {
            if (utextPos != rangeStart ||
                (utextPos > 0 &&
                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
                foundBreaks.push(utextPos, status);
                correctedNumBreaks++;
            }
        } else {
            // Normalization expanded the input text, the dictionary found a boundary
            // within the expansion, giving two boundaries with the same index in the
            // original text. Ignore the second. See ticket #12918.
            // NOTE(review): numBreaks is also this loop's upper bound, so decrementing
            // it here shortens the iteration by one trailing boundary - confirm that
            // this is intended.
            --numBreaks;
        }
        prevCPPos = cpPos;
        prevUTextPos = utextPos;
    }
    (void)prevCPPos;  // suppress compiler warnings about unused variable

    UChar32 nextChar = utext_char32At(inText, rangeEnd);
    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
        // In phrase breaking, there has to be a breakpoint between Cj character and
        // the number/open punctuation.
        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
        // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
            foundBreaks.popi();
            correctedNumBreaks--;
        }
    }

    return correctedNumBreaks;
}
// Scores the six-code-point window that starts at indexList[startIdx] and, when
// the score is positive, records a boundary at code point position startIdx + 1.
//
// The score starts at fNegativeSum and adds the model value of every unigram
// (UW1-UW6), bigram (BW1-BW3) and trigram (TW1-TW4) whose code points all lie
// inside the string (indexList entry != -1).
//
// @return numBreaks, incremented by one if a boundary was added.
int32_t MlBreakEngine::evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList,
                                          int32_t startIdx, int32_t numCodeUnits, int32_t numBreaks,
                                          UVector32 &boundary, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return numBreaks;
    }

    int32_t score = fNegativeSum;

    // Adds the score of the n-gram made of the `n` code points beginning at
    // indexList[first], looked up in table `slot` of feature group `group`.
    // Does nothing if any of those code points is missing. The n-gram ends at
    // the following code point, or at numCodeUnits when there is none.
    auto addFeature = [&](ModelIndex group, int32_t slot, int32_t first, int32_t n) {
        for (int32_t k = 0; k < n; k++) {
            if (indexList[first + k] == -1) {
                return;
            }
        }
        const int32_t begin = indexList[first];
        const int32_t limit = (indexList[first + n] != -1) ? indexList[first + n] : numCodeUnits;
        score += fModel[static_cast<int32_t>(group) + slot].geti(
            inString.tempSubString(begin, limit - begin));
    };

    // UW1 ~ UW6
    for (int32_t i = 0; i < 6; i++) {
        addFeature(ModelIndex::kUWStart, i, startIdx + i, 1);
    }
    // BW1 ~ BW3
    for (int32_t i = 0; i < 3; i++) {
        addFeature(ModelIndex::kBWStart, i, startIdx + i + 1, 2);
    }
    // TW1 ~ TW4
    for (int32_t i = 0; i < 4; i++) {
        addFeature(ModelIndex::kTWStart, i, startIdx + i, 3);
    }

    if (score > 0) {
        boundary.addElement(startIdx + 1, status);
        numBreaks++;
    }
    return numBreaks;
}
// Prepares indexList for the first sliding window.
//
// All (length + 4) entries are set to -1, then entries 2..5 receive the code
// unit index of the first up to four code points of inString.
//
// @return The number of code units covered by those first code points.
int32_t MlBreakEngine::initIndexList(const UnicodeString &inString, int32_t *indexList,
                                     UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    const int32_t length = inString.countChar32();
    // Set all (length+4) items inside indexList to -1, presuming -1 is 4 bytes of 0xff.
    uprv_memset(indexList, 0xff, (length + 4) * sizeof(int32_t));

    const int32_t primed = (length < 4) ? length : 4;
    int32_t index = 0;
    for (int32_t k = 0; k < primed; k++) {
        indexList[2 + k] = index;
        index += U16_LENGTH(inString.char32At(index));
    }
    return index;
}
// Loads the thirteen feature tables of the model from the "jaml" break-iterator
// bundle into fModel, then halves fNegativeSum.
//
// @param error Information on any errors encountered. Nothing is loaded if it
//              already indicates a failure on entry or the bundle cannot be opened.
void MlBreakEngine::loadMLModel(UErrorCode &error) {
    // BudouX's model consists of thirteen categories, each of which is made up of pairs of the
    // feature and its score. As integrating it into jaml.txt, we define thirteen kinds of key and
    // value to represent the feature and the corresponding score respectively.
    if (U_FAILURE(error)) return;

    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
    UResourceBundle *rb = rbp.getAlias();
    if (U_FAILURE(error)) return;

    // Resource names of the keys and values of each table, in fModel order.
    static const struct {
        const char *keys;
        const char *values;
    } kTables[] = {
        {"UW1Keys", "UW1Values"}, {"UW2Keys", "UW2Values"}, {"UW3Keys", "UW3Values"},
        {"UW4Keys", "UW4Values"}, {"UW5Keys", "UW5Values"}, {"UW6Keys", "UW6Values"},
        {"BW1Keys", "BW1Values"}, {"BW2Keys", "BW2Values"}, {"BW3Keys", "BW3Values"},
        {"TW1Keys", "TW1Values"}, {"TW2Keys", "TW2Values"}, {"TW3Keys", "TW3Values"},
        {"TW4Keys", "TW4Values"},
    };
    static_assert(sizeof(kTables) / sizeof(kTables[0]) == sizeof(fModel) / sizeof(fModel[0]),
                  "one resource name pair is needed for every table in fModel");

    const int32_t tableCount = static_cast<int32_t>(sizeof(kTables) / sizeof(kTables[0]));
    for (int32_t index = 0; index < tableCount; index++) {
        initKeyValue(rb, kTables[index].keys, kTables[index].values, fModel[index], error);
    }
    fNegativeSum /= 2;
}
// Loads one feature table of the model.
//
// Reads the string array keyName and the int vector valueName from rb, stores
// every (key, value) pair in `model` and subtracts every value from fNegativeSum.
//
// @param rb        The model's resource bundle.
// @param keyName   Resource name of the feature strings.
// @param valueName Resource name of the scores, parallel to the feature strings.
// @param model     Receives the feature -> score pairs.
// @param error     Information on any errors encountered. Set to
//                  U_INVALID_FORMAT_ERROR when there are more keys than values.
//                  That case was only asserted in debug builds and read past
//                  the end of the value vector in release builds.
void MlBreakEngine::initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
                                 Hashtable &model, UErrorCode &error) {
    int32_t keySize = 0;
    int32_t valueSize = 0;
    int32_t stringLength = 0;
    UnicodeString key;
    StackUResourceBundle stackTempBundle;
    ResourceDataValue modelKey;

    // get modelValues
    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, valueName, nullptr, &error));
    const int32_t *value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
    if (U_FAILURE(error)) return;

    // get modelKeys
    ures_getValueWithFallback(rb, keyName, stackTempBundle.getAlias(), modelKey, error);
    ResourceArray stringArray = modelKey.getArray(error);
    keySize = stringArray.getSize();
    if (U_FAILURE(error)) return;

    // Every key needs a score.
    if (keySize > valueSize) {
        error = U_INVALID_FORMAT_ERROR;
        return;
    }

    // Once an error has been recorded no further pair can be stored, so stop.
    for (int32_t idx = 0; idx < keySize && U_SUCCESS(error); idx++) {
        stringArray.getValue(idx, modelKey);
        key = UnicodeString(modelKey.getString(stringLength, error));
        if (U_SUCCESS(error)) {
            fNegativeSum -= value[idx];
            model.puti(key, value[idx], error);
        }
    }
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

116
engine/thirdparty/icu4c/common/mlbe.h vendored Normal file
View file

@ -0,0 +1,116 @@
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef MLBREAKENGINE_H
#define MLBREAKENGINE_H
#include "hash.h"
#include "unicode/resbund.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
#if !UCONFIG_NO_BREAK_ITERATION
/**
* A machine learning break engine for the phrase breaking in Japanese.
*/
class MlBreakEngine : public UMemory {
public:
/**
* Constructor. Copies the two sets and loads the model.
*
* @param digitOrOpenPunctuationOrAlphabetSet A UnicodeSet with the digit, open punctuation and
* alphabet.
* @param closePunctuationSet A UnicodeSet with close punctuation.
* @param status Information on any errors encountered.
*/
MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
const UnicodeSet &closePunctuationSet, UErrorCode &status);
/**
* Virtual destructor.
*/
virtual ~MlBreakEngine();
public:
/**
* Divide up a range of characters handled by this break engine.
*
* @param inText A UText representing the text
* @param rangeStart The start of the range of the characters
* @param rangeEnd The end of the range of the characters
* @param foundBreaks Output vector onto which the break positions found are pushed
* @param inString The normalized string of text ranging from rangeStart to rangeEnd
* @param inputMap The vector storing the native index of inText
* @param status Information on any errors encountered.
* @return The number of breaks found
*/
int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
UVector32 &foundBreaks, const UnicodeString &inString,
const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
private:
/**
* Load the machine learning model file.
*
* @param error Information on any errors encountered.
*/
void loadMLModel(UErrorCode &error);
/**
* In the machine learning model file, specify the name of the key and value to load the
* corresponding feature and its score.
*
* @param rb A ResourceBundle corresponding to the model file.
* @param keyName The key name in the model file.
* @param valueName The value name in the model file.
* @param model A hashtable to store the pairs of the feature and its score.
* @param error Information on any errors encountered.
*/
void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
Hashtable &model, UErrorCode &error);
/**
* Initialize the index list from the input string.
*
* @param inString An input string to be segmented.
* @param indexList A code unit index list of inString.
* @param status Information on any errors encountered.
* @return The number of code units of the first four characters in inString.
*/
int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
UErrorCode &status) const;
/**
* Evaluate whether the index is a potential breakpoint.
*
* @param inString An input string to be segmented.
* @param indexList A code unit index list of the inString.
* @param startIdx The start index of the indexList.
* @param numCodeUnits The current code unit boundary of the indexList.
* @param numBreaks The accumulated number of breakpoints.
* @param boundary A vector including the index of the breakpoint.
* @param status Information on any errors encountered.
* @return The number of breakpoints
*/
int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
UErrorCode &status) const;
// NOTE(review): no definition of this method is visible in mlbe.cpp - confirm
// that the declaration is still needed.
void printUnicodeString(const UnicodeString &s) const;
// Set of digits, open punctuation and alphabetic characters given to the constructor.
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
// Set of close punctuation characters given to the constructor.
UnicodeSet fClosePunctuationSet;
Hashtable fModel[13]; // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
// Initial score of every breakpoint evaluation: the negated sum of all model
// values, halved once the model has been loaded.
int32_t fNegativeSum;
};
#endif
U_NAMESPACE_END
/* MLBREAKENGINE_H */
#endif

View file

@ -0,0 +1,25 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//{{NO_DEPENDENCIES}}
// Copyright (c) 2003-2010 International Business Machines
// Corporation and others. All Rights Reserved.
//
// Used by common.rc and other .rc files.
//Do not edit with Microsoft Developer Studio because it will modify this
//header the wrong way. This is here to prevent Visual Studio .NET from
//unnecessarily building the resource files when it's not needed.
//
/*
These are defined before unicode/uversion.h in order to prevent
STLPort's broken stddef.h from being used when rc.exe parses this file.
*/
#define _STLP_OUTERMOST_HEADER_ID 0
#define _STLP_WINCE 1
#include "unicode/uversion.h"
// String constants made available to the .rc files that include this header.
#define ICU_WEBSITE "https://icu.unicode.org/"
#define ICU_COMPANY "The ICU Project"
#define ICU_PRODUCT_PREFIX "ICU"
#define ICU_PRODUCT "International Components for Unicode"

77
engine/thirdparty/icu4c/common/mutex.h vendored Normal file
View file

@ -0,0 +1,77 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*/
//----------------------------------------------------------------------------
// File: mutex.h
//
// Lightweight C++ wrapper for umtx_ C mutex functions
//
// Author: Alan Liu 1/31/97
// History:
// 06/04/97 helena Updated setImplementation as per feedback from 5/21 drop.
// 04/07/1999 srl refocused as a thin wrapper
//
//----------------------------------------------------------------------------
#ifndef MUTEX_H
#define MUTEX_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
/**
* Mutex is a helper class for convenient locking and unlocking of a UMutex.
*
* Creating a local scope Mutex will lock a UMutex, holding the lock until the Mutex
* goes out of scope.
*
* If no UMutex is specified, the ICU global mutex is implied.
*
* For example:
*
* static UMutex myMutex;
*
* void Function(int arg1, int arg2)
* {
* static Object* foo; // Shared read-write object
* Mutex mutex(&myMutex); // or no args for the global lock
* foo->Method();
* // When 'mutex' goes out of scope and gets destroyed here, the lock is released
* }
*
* Note: Do NOT use the form 'Mutex mutex();' as that merely forward-declares a function
* returning a Mutex. This is a common mistake which silently slips through the
* compiler!!
*/
class U_COMMON_API Mutex : public UMemory {
public:
// Stores the given mutex pointer and immediately locks it via umtx_lock().
// The default nullptr is passed through to umtx_lock() unchanged.
Mutex(UMutex *mutex = nullptr) : fMutex(mutex) {
umtx_lock(fMutex);
}
// Unlocks the same mutex pointer that the constructor locked.
~Mutex() {
umtx_unlock(fMutex);
}
Mutex(const Mutex &other) = delete; // forbid copying of this class
Mutex &operator=(const Mutex &other) = delete; // forbid assigning of this class
void *operator new(size_t s) = delete; // forbid heap allocation. Locals only.
private:
// The mutex pointer handed to umtx_lock()/umtx_unlock(); may be nullptr.
UMutex *fMutex;
};
U_NAMESPACE_END
#endif // MUTEX_H
//eof

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,406 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* norm2allmodes.h
*
* created on: 2014sep07
* created by: Markus W. Scherer
*/
#ifndef __NORM2ALLMODES_H__
#define __NORM2ALLMODES_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringoptions.h"
#include "unicode/unistr.h"
#include "cpputils.h"
#include "normalizer2impl.h"
U_NAMESPACE_BEGIN
// Intermediate class:
// Holds a reference to a Normalizer2Impl and does the boilerplate argument checking
// and buffer setup for the UnicodeString-based API. The actual work is delegated to
// the pointer-based pure virtual hooks that each concrete subclass implements.
class Normalizer2WithImpl : public Normalizer2 {
public:
    Normalizer2WithImpl(const Normalizer2Impl &ni) : impl(ni) {}
    virtual ~Normalizer2WithImpl();

    // normalize
    // Writes the normalized form of src into dest. dest is set to bogus when errorCode
    // already indicates failure, when src and dest are the same object, or when
    // src.getBuffer() returns nullptr (the latter two set U_ILLEGAL_ARGUMENT_ERROR).
    UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const override {
        if(U_SUCCESS(errorCode)) {
            const char16_t *text=src.getBuffer();
            if(text!=nullptr && &dest!=&src) {
                dest.remove();
                ReorderingBuffer buffer(impl, dest);
                if(buffer.init(src.length(), errorCode)) {
                    normalize(text, text+src.length(), buffer, errorCode);
                }
                return dest;
            }
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        }
        dest.setToBogus();
        return dest;
    }

    // Pointer-based hook: normalizes [src, limit) into buffer.
    virtual void
    normalize(const char16_t *src, const char16_t *limit,
              ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;

    // normalize and append
    UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const override {
        return normalizeSecondAndAppend(first, second, true, errorCode);
    }

    UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const override {
        return normalizeSecondAndAppend(first, second, false, errorCode);
    }

    // Shared implementation of normalizeSecondAndAppend() (doNormalize=true)
    // and append() (doNormalize=false).
    UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UBool doNormalize,
                             UErrorCode &errorCode) const {
        uprv_checkCanGetBuffer(first, errorCode);
        if(U_FAILURE(errorCode)) {
            return first;
        }
        const char16_t *tail=second.getBuffer();
        if(tail==nullptr || &first==&second) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return first;
        }
        const int32_t originalLength=first.length();
        UnicodeString safeMiddle;
        {
            ReorderingBuffer buffer(impl, first);
            if(buffer.init(originalLength+second.length(), errorCode)) {
                normalizeAndAppend(tail, tail+second.length(), doNormalize,
                                   safeMiddle, buffer, errorCode);
            }
        }  // The ReorderingBuffer destructor finalizes the first string.
        if(U_FAILURE(errorCode)) {
            // Restore the modified suffix of the first string.
            first.replace(originalLength-safeMiddle.length(), 0x7fffffff, safeMiddle);
        }
        return first;
    }

    // Pointer-based hook: appends [src, limit) to buffer, normalizing if doNormalize is set.
    // safeMiddle is used by the caller above to restore the first string on failure.
    virtual void
    normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
                       UnicodeString &safeMiddle,
                       ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;

    UBool
    getDecomposition(UChar32 c, UnicodeString &decomposition) const override {
        char16_t scratch[4];
        int32_t length;
        const char16_t *mapping=impl.getDecomposition(c, scratch, length);
        if(mapping==nullptr) {
            return false;
        }
        if(mapping!=scratch) {
            decomposition.setTo(false, mapping, length);  // read-only alias
        } else {
            decomposition.setTo(scratch, length);  // copy the string (Jamos from Hangul syllable c)
        }
        return true;
    }

    UBool
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override {
        char16_t scratch[30];
        int32_t length;
        const char16_t *mapping=impl.getRawDecomposition(c, scratch, length);
        if(mapping==nullptr) {
            return false;
        }
        if(mapping!=scratch) {
            decomposition.setTo(false, mapping, length);  // read-only alias
        } else {
            decomposition.setTo(scratch, length);  // copy the string (algorithmic decomposition)
        }
        return true;
    }

    UChar32
    composePair(UChar32 a, UChar32 b) const override {
        return impl.composePair(a, b);
    }

    uint8_t
    getCombiningClass(UChar32 c) const override {
        return impl.getCC(impl.getNorm16(c));
    }

    // quick checks
    // True when the pointer-based spanQuickCheckYes() reaches the end of s.
    UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
        if(U_FAILURE(errorCode)) {
            return false;
        }
        const char16_t *start=s.getBuffer();
        if(start==nullptr) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return false;
        }
        const char16_t *end=start+s.length();
        return spanQuickCheckYes(start, end, errorCode)==end;
    }

    // Uses this class's own isNormalized() (explicitly qualified, so not dispatched virtually).
    UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
        if(Normalizer2WithImpl::isNormalized(s, errorCode)) {
            return UNORM_YES;
        }
        return UNORM_NO;
    }

    // Returns the offset into s at which the pointer-based spanQuickCheckYes() stopped.
    int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override {
        if(U_FAILURE(errorCode)) {
            return 0;
        }
        const char16_t *start=s.getBuffer();
        if(start==nullptr) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        const char16_t *stop=spanQuickCheckYes(start, start+s.length(), errorCode);
        return (int32_t)(stop-start);
    }

    // Pointer-based hook for the quick-check span over [src, limit).
    virtual const char16_t *
    spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const = 0;

    // Default answer for a single code point; subclasses may override.
    virtual UNormalizationCheckResult getQuickCheck(UChar32) const {
        return UNORM_YES;
    }

    const Normalizer2Impl &impl;
};
class DecomposeNormalizer2 : public Normalizer2WithImpl {
public:
DecomposeNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
virtual ~DecomposeNormalizer2();
private:
virtual void
normalize(const char16_t *src, const char16_t *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
impl.decompose(src, limit, &buffer, errorCode);
}
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
virtual void
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
UnicodeString &safeMiddle,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
}
void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override {
if (U_FAILURE(errorCode)) {
return;
}
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
impl.decomposeUTF8(options, s, s + src.length(), &sink, edits, errorCode);
sink.Flush();
}
virtual UBool
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
if(U_FAILURE(errorCode)) {
return false;
}
const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());
const uint8_t *sLimit = s + sp.length();
return sLimit == impl.decomposeUTF8(0, s, sLimit, nullptr, nullptr, errorCode);
}
virtual const char16_t *
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override {
return impl.decompose(src, limit, nullptr, errorCode);
}
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
}
virtual UBool hasBoundaryBefore(UChar32 c) const override {
return impl.hasDecompBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const override {
return impl.hasDecompBoundaryAfter(c);
}
virtual UBool isInert(UChar32 c) const override {
return impl.isDecompInert(c);
}
};
class ComposeNormalizer2 : public Normalizer2WithImpl {
public:
ComposeNormalizer2(const Normalizer2Impl &ni, UBool fcc) :
Normalizer2WithImpl(ni), onlyContiguous(fcc) {}
virtual ~ComposeNormalizer2();
private:
virtual void
normalize(const char16_t *src, const char16_t *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
impl.compose(src, limit, onlyContiguous, true, buffer, errorCode);
}
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override {
if (U_FAILURE(errorCode)) {
return;
}
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
impl.composeUTF8(options, onlyContiguous, s, s + src.length(),
&sink, edits, errorCode);
sink.Flush();
}
virtual void
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
UnicodeString &safeMiddle,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, safeMiddle, buffer, errorCode);
}
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
if(U_FAILURE(errorCode)) {
return false;
}
const char16_t *sArray=s.getBuffer();
if(sArray==nullptr) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return false;
}
UnicodeString temp;
ReorderingBuffer buffer(impl, temp);
if(!buffer.init(5, errorCode)) { // small destCapacity for substring normalization
return false;
}
return impl.compose(sArray, sArray+s.length(), onlyContiguous, false, buffer, errorCode);
}
virtual UBool
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
if(U_FAILURE(errorCode)) {
return false;
}
const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());
return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode);
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
if(U_FAILURE(errorCode)) {
return UNORM_MAYBE;
}
const char16_t *sArray=s.getBuffer();
if(sArray==nullptr) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return UNORM_MAYBE;
}
UNormalizationCheckResult qcResult=UNORM_YES;
impl.composeQuickCheck(sArray, sArray+s.length(), onlyContiguous, &qcResult);
return qcResult;
}
virtual const char16_t *
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &) const override {
return impl.composeQuickCheck(src, limit, onlyContiguous, nullptr);
}
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
return impl.getCompQuickCheck(impl.getNorm16(c));
}
virtual UBool hasBoundaryBefore(UChar32 c) const override {
return impl.hasCompBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const override {
return impl.hasCompBoundaryAfter(c, onlyContiguous);
}
virtual UBool isInert(UChar32 c) const override {
return impl.isCompInert(c, onlyContiguous);
}
const UBool onlyContiguous;
};
class FCDNormalizer2 : public Normalizer2WithImpl {
public:
FCDNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
virtual ~FCDNormalizer2();
private:
virtual void
normalize(const char16_t *src, const char16_t *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
impl.makeFCD(src, limit, &buffer, errorCode);
}
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
virtual void
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
UnicodeString &safeMiddle,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
}
virtual const char16_t *
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override {
return impl.makeFCD(src, limit, nullptr, errorCode);
}
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
virtual UBool hasBoundaryBefore(UChar32 c) const override {
return impl.hasFCDBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const override {
return impl.hasFCDBoundaryAfter(c);
}
virtual UBool isInert(UChar32 c) const override {
return impl.isFCDInert(c);
}
};
// Bundles one Normalizer2Impl pointer with the four normalizer objects built on top of it.
struct Norm2AllModes : public UMemory {
// All four normalizers are constructed from *i, so i must not be nullptr.
// comp and fcc are both ComposeNormalizer2 and differ only in the flag passed (false vs. true).
Norm2AllModes(Normalizer2Impl *i)
: impl(i), comp(*i, false), decomp(*i), fcd(*i), fcc(*i, true) {}
~Norm2AllModes();
// Factory functions; each reports failure through errorCode.
static Norm2AllModes *createInstance(Normalizer2Impl *impl, UErrorCode &errorCode);
static Norm2AllModes *createNFCInstance(UErrorCode &errorCode);
static Norm2AllModes *createInstance(const char *packageName,
const char *name,
UErrorCode &errorCode);
// Accessors returning const instances (not defined in this header).
static const Norm2AllModes *getNFCInstance(UErrorCode &errorCode);
static const Norm2AllModes *getNFKCInstance(UErrorCode &errorCode);
static const Norm2AllModes *getNFKC_CFInstance(UErrorCode &errorCode);
static const Norm2AllModes *getNFKC_SCFInstance(UErrorCode &errorCode);
// Declared first, so it is initialized before the normalizers below that reference *impl.
Normalizer2Impl *impl;
ComposeNormalizer2 comp;
DecomposeNormalizer2 decomp;
FCDNormalizer2 fcd;
ComposeNormalizer2 fcc;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_NORMALIZATION
#endif // __NORM2ALLMODES_H__

View file

@ -0,0 +1,572 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: normalizer2.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov22
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringoptions.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cstring.h"
#include "mutex.h"
#include "norm2allmodes.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "ucln_cmn.h"
using icu::Normalizer2Impl;
#if NORM2_HARDCODE_NFC_DATA
// NFC/NFD data machine-generated by gennorm2 --csource
#define INCLUDED_FROM_NORMALIZER2_CPP
#include "norm2_nfc_data.h"
#endif
U_NAMESPACE_BEGIN
// Public API dispatch via Normalizer2 subclasses -------------------------- ***
// Out-of-line, empty definition of the Normalizer2 destructor.
Normalizer2::~Normalizer2() {}
// Default UTF-8 normalization: converts src to a UnicodeString, normalizes it with the
// UnicodeString API and writes the result to sink as UTF-8.
// This default does not support Edits: a non-null edits sets U_UNSUPPORTED_ERROR.
// The options argument is ignored.
void
Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
                           Edits *edits, UErrorCode &errorCode) const {
    if (U_SUCCESS(errorCode)) {
        if (edits == nullptr) {
            UnicodeString utf16Source = UnicodeString::fromUTF8(src);
            normalize(utf16Source, errorCode).toUTF8(sink);
        } else {
            errorCode = U_UNSUPPORTED_ERROR;
        }
    }
}
// Base-class default: ignores both arguments and always returns false.
UBool
Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
return false;
}
// Base-class default: ignores both code points and always returns U_SENTINEL.
UChar32
Normalizer2::composePair(UChar32, UChar32) const {
return U_SENTINEL;
}
// Base-class default: ignores the code point and always returns 0.
uint8_t
Normalizer2::getCombiningClass(UChar32 /*c*/) const {
return 0;
}
// Default UTF-8 check: converts s to a UnicodeString and calls isNormalized().
// When errorCode already indicates failure, returns false without converting (short-circuit).
UBool
Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);
}
// Normalizer2 implementation for the old UNORM_NONE.
class NoopNormalizer2 : public Normalizer2 {
virtual ~NoopNormalizer2();
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const override {
if(U_SUCCESS(errorCode)) {
if(&dest!=&src) {
dest=src;
} else {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
}
}
return dest;
}
virtual void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override {
if(U_SUCCESS(errorCode)) {
if (edits != nullptr) {
if ((options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
edits->addUnchanged(src.length());
}
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
sink.Append(src.data(), src.length());
}
sink.Flush();
}
}
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override {
if(U_SUCCESS(errorCode)) {
if(&first!=&second) {
first.append(second);
} else {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
}
}
return first;
}
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override {
if(U_SUCCESS(errorCode)) {
if(&first!=&second) {
first.append(second);
} else {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
}
}
return first;
}
virtual UBool
getDecomposition(UChar32, UnicodeString &) const override {
return false;
}
// No need to override the default getRawDecomposition().
virtual UBool
isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
return U_SUCCESS(errorCode);
}
virtual UBool
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
return U_SUCCESS(errorCode);
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &, UErrorCode &) const override {
return UNORM_YES;
}
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override {
return s.length();
}
virtual UBool hasBoundaryBefore(UChar32) const override { return true; }
virtual UBool hasBoundaryAfter(UChar32) const override { return true; }
virtual UBool isInert(UChar32) const override { return true; }
};
// Out-of-line, empty definitions of the destructors declared in the normalizer classes.
NoopNormalizer2::~NoopNormalizer2() {}
Normalizer2WithImpl::~Normalizer2WithImpl() {}
DecomposeNormalizer2::~DecomposeNormalizer2() {}
ComposeNormalizer2::~ComposeNormalizer2() {}
FCDNormalizer2::~FCDNormalizer2() {}
// instance cache ---------------------------------------------------------- ***
U_CDECL_BEGIN
// Forward declaration of the cleanup callback that is passed to ucln_common_registerCleanup().
static UBool U_CALLCONV uprv_normalizer2_cleanup();
U_CDECL_END
// The no-op normalizer instance and the init-once guard used to create it.
static Normalizer2 *noopSingleton;
static icu::UInitOnce noopInitOnce {};
// Init-once callback: allocates the no-op normalizer singleton and, on success,
// registers the cleanup function. Sets U_MEMORY_ALLOCATION_ERROR if allocation fails.
static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        noopSingleton=new NoopNormalizer2;
        if(noopSingleton!=nullptr) {
            ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
        } else {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
// Returns the no-op normalizer singleton, creating it on first use.
// Returns nullptr without attempting creation if errorCode already indicates failure.
const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
        return noopSingleton;
    }
    return nullptr;
}
// Returns the Normalizer2Impl behind norm2.
// The downcast is unchecked: norm2 must actually point to a Normalizer2WithImpl.
const Normalizer2Impl *
Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
    const Normalizer2WithImpl *withImpl=static_cast<const Normalizer2WithImpl *>(norm2);
    return &withImpl->impl;
}
// Deletes the Normalizer2Impl that was passed to the constructor.
Norm2AllModes::~Norm2AllModes() {
delete impl;
}
// Wraps impl in a new Norm2AllModes.
// On every failure path (errorCode already failing, or allocation failure) impl is deleted
// here and nullptr is returned, so the caller must not use impl afterwards in that case.
Norm2AllModes *
Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        Norm2AllModes *allModes=new Norm2AllModes(impl);
        if(allModes!=nullptr) {
            return allModes;
        }
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    delete impl;
    return nullptr;
}
#if NORM2_HARDCODE_NFC_DATA
// Builds a Norm2AllModes from the norm2_nfc_data_* tables.
// Returns nullptr if errorCode already indicates failure or if an allocation fails.
Norm2AllModes *
Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    Normalizer2Impl *nfcImpl=new Normalizer2Impl;
    if(nfcImpl!=nullptr) {
        nfcImpl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
                      norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
        return createInstance(nfcImpl, errorCode);
    }
    errorCode=U_MEMORY_ALLOCATION_ERROR;
    return nullptr;
}
// The NFC Norm2AllModes instance and the init-once guard used to create it.
static Norm2AllModes *nfcSingleton;
static icu::UInitOnce nfcInitOnce {};
// Init-once callback: creates the NFC instance, then registers the cleanup function.
// The cleanup function is registered whether or not creation succeeded.
static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
}
// Returns the NFC Norm2AllModes singleton, creating it on first use.
// Returns nullptr without attempting creation if errorCode already indicates failure.
const Norm2AllModes *
Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
        return nfcSingleton;
    }
    return nullptr;
}
// Returns the comp normalizer of the NFC Norm2AllModes instance, or nullptr if unavailable.
const Normalizer2 *
Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    if(allModes==nullptr) {
        return nullptr;
    }
    return &allModes->comp;
}
// Returns the decomp normalizer of the NFC Norm2AllModes instance, or nullptr if unavailable.
const Normalizer2 *
Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    if(allModes==nullptr) {
        return nullptr;
    }
    return &allModes->decomp;
}
// Returns the fcd normalizer of the NFC Norm2AllModes instance, or nullptr if unavailable.
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    if(allModes==nullptr) {
        return nullptr;
    }
    return &allModes->fcd;
}
// Returns the fcc normalizer of the NFC Norm2AllModes instance, or nullptr if unavailable.
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    if(allModes==nullptr) {
        return nullptr;
    }
    return &allModes->fcc;
}
// Returns the Normalizer2Impl of the NFC Norm2AllModes instance, or nullptr if unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    if(allModes==nullptr) {
        return nullptr;
    }
    return allModes->impl;
}
#endif // NORM2_HARDCODE_NFC_DATA
U_CDECL_BEGIN
// Cleanup callback: deletes the singletons created in this file, clears their pointers and
// resets their init-once guards. Always returns true.
static UBool U_CALLCONV uprv_normalizer2_cleanup() {
delete noopSingleton;
noopSingleton = nullptr;
noopInitOnce.reset();
#if NORM2_HARDCODE_NFC_DATA
// The NFC singleton only exists when the NFC data is hardcoded.
delete nfcSingleton;
nfcSingleton = nullptr;
nfcInitOnce.reset();
#endif
return true;
}
U_CDECL_END
U_NAMESPACE_END
// C API ------------------------------------------------------------------- ***
U_NAMESPACE_USE
// C wrapper: returns Normalizer2::getNFCInstance() as an opaque UNormalizer2 pointer.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFCInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfc=Normalizer2::getNFCInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfc);
}
// C wrapper: returns Normalizer2::getNFDInstance() as an opaque UNormalizer2 pointer.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfd=Normalizer2::getNFDInstance(*pErrorCode);
    return reinterpret_cast<const UNormalizer2 *>(nfd);
}
// C wrapper: deletes the Normalizer2 object behind the opaque pointer.
U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 *norm2) {
    Normalizer2 *normalizer=reinterpret_cast<Normalizer2 *>(norm2);
    delete normalizer;
}
// C wrapper for normalization into a caller-supplied buffer.
// src may be given with length -1 (handled as NUL-terminated below). The return value is
// whatever UnicodeString::extract() reports for dest/capacity.
// Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 for inconsistent pointer/length pairs
// or when src and dest are the same non-null pointer.
U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
                 const char16_t *src, int32_t length,
                 char16_t *dest, int32_t capacity,
                 UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A null pointer is only acceptable together with a zero length/capacity.
    const bool srcInvalid=(src==nullptr) ? (length!=0) : (length<-1);
    const bool destInvalid=(dest==nullptr) ? (capacity!=0) : (capacity<0);
    const bool sameBuffer=(src!=nullptr) && (src==dest);
    if(srcInvalid || destInvalid || sameBuffer) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(dest, 0, capacity);
    // length==0: Nothing to do, and n2wi->normalize(nullptr, nullptr, buffer, ...) would crash.
    if(length!=0) {
        const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
        const Normalizer2WithImpl *withImpl=dynamic_cast<const Normalizer2WithImpl *>(normalizer);
        if(withImpl==nullptr) {
            // Generic path through the UnicodeString API.
            UnicodeString srcString(length<0, src, length);
            normalizer->normalize(srcString, destString, *pErrorCode);
        } else {
            // Avoid duplicate argument checking and support NUL-terminated src.
            ReorderingBuffer buffer(withImpl->impl, destString);
            if(buffer.init(length, *pErrorCode)) {
                const char16_t *limit=(length<0) ? nullptr : src+length;
                withImpl->normalize(src, limit, buffer, *pErrorCode);
            }
        }
    }
    return destString.extract(dest, capacity, *pErrorCode);
}
// Shared implementation of unorm2_normalizeSecondAndAppend() (doNormalize=true) and
// unorm2_append() (doNormalize=false).
// Appends second to the text in the caller's first[] buffer and returns the result of
// extracting the combined string back into first[0..firstCapacity).
// Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 for inconsistent pointer/length/capacity
// combinations or when first and second are the same non-null pointer.
static int32_t
normalizeSecondAndAppend(const UNormalizer2 *norm2,
char16_t *first, int32_t firstLength, int32_t firstCapacity,
const char16_t *second, int32_t secondLength,
UBool doNormalize,
UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
// A null pointer is only acceptable with zero length (and zero capacity for first);
// -1 is accepted as a length for both strings.
if( (second==nullptr ? secondLength!=0 : secondLength<-1) ||
(first==nullptr ? (firstCapacity!=0 || firstLength!=0) :
(firstCapacity<0 || firstLength<-1)) ||
(first==second && first!=nullptr)
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
UnicodeString firstString(first, firstLength, firstCapacity);
firstLength=firstString.length(); // In case it was -1.
// secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(nullptr, nullptr, buffer, ...) would crash.
if(secondLength!=0) {
const Normalizer2 *n2=(const Normalizer2 *)norm2;
const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
if(n2wi!=nullptr) {
// Avoid duplicate argument checking and support NUL-terminated src.
UnicodeString safeMiddle;
{
ReorderingBuffer buffer(n2wi->impl, firstString);
if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : nullptr,
doNormalize, safeMiddle, buffer, *pErrorCode);
}
} // The ReorderingBuffer destructor finalizes firstString.
if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
// Restore the modified suffix of the first string.
// This does not restore first[] array contents between firstLength and firstCapacity.
// (That might be uninitialized memory, as far as we know.)
if(first!=nullptr) { /* don't dereference nullptr */
// Write safeMiddle back so that it ends at first[firstLength].
safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
if(firstLength<firstCapacity) {
first[firstLength]=0; // NUL-terminate in case it was originally.
}
}
}
} else {
// Generic path through the UnicodeString API.
UnicodeString secondString(secondLength<0, second, secondLength);
if(doNormalize) {
n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
} else {
n2->append(firstString, secondString, *pErrorCode);
}
}
}
return firstString.extract(first, firstCapacity, *pErrorCode);
}
// C wrapper: forwards to the shared helper with normalization of second enabled.
U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
                                char16_t *first, int32_t firstLength, int32_t firstCapacity,
                                const char16_t *second, int32_t secondLength,
                                UErrorCode *pErrorCode) {
    const UBool doNormalize=true;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
// C wrapper: forwards to the shared helper with normalization of second disabled.
U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
              char16_t *first, int32_t firstLength, int32_t firstCapacity,
              const char16_t *second, int32_t secondLength,
              UErrorCode *pErrorCode) {
    const UBool doNormalize=false;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
// C wrapper around Normalizer2::getDecomposition().
// Returns -1 when the C++ call reports no decomposition; otherwise the result of extracting
// the decomposition into the caller's buffer. Returns 0 on prior or argument error.
U_CAPI int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 *norm2,
                        UChar32 c, char16_t *decomposition, int32_t capacity,
                        UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A null buffer is only acceptable together with a zero capacity.
    const bool badBuffer=(decomposition==nullptr) ? (capacity!=0) : (capacity<0);
    if(badBuffer) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(decomposition, 0, capacity);
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    if(!normalizer->getDecomposition(c, destString)) {
        return -1;
    }
    return destString.extract(decomposition, capacity, *pErrorCode);
}
// C wrapper around Normalizer2::getRawDecomposition().
// Returns -1 when the C++ call reports no decomposition; otherwise the result of extracting
// the decomposition into the caller's buffer. Returns 0 on prior or argument error.
U_CAPI int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 *norm2,
                           UChar32 c, char16_t *decomposition, int32_t capacity,
                           UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A null buffer is only acceptable together with a zero capacity.
    const bool badBuffer=(decomposition==nullptr) ? (capacity!=0) : (capacity<0);
    if(badBuffer) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(decomposition, 0, capacity);
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    if(!normalizer->getRawDecomposition(c, destString)) {
        return -1;
    }
    return destString.extract(decomposition, capacity, *pErrorCode);
}
// C wrapper: forwards to composePair() on the C++ object behind norm2.
U_CAPI UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->composePair(a, b);
}
// C wrapper: forwards to getCombiningClass() on the C++ object behind norm2.
U_CAPI uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->getCombiningClass(c);
}
// C wrapper around Normalizer2::isNormalized().
// length may be -1, in which case s is wrapped as a NUL-terminated string.
// Returns false on prior error or on an invalid pointer/length combination.
U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
                    const char16_t *s, int32_t length,
                    UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return false;
    }
    if(length<-1 || (s==nullptr && length!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return false;
    }
    UnicodeString text(length<0, s, length);
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->isNormalized(text, *pErrorCode);
}
// C wrapper around Normalizer2::quickCheck().
// length may be -1, in which case s is wrapped as a NUL-terminated string.
// Returns UNORM_NO on prior error or on an invalid pointer/length combination.
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
                  const char16_t *s, int32_t length,
                  UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return UNORM_NO;
    }
    if(length<-1 || (s==nullptr && length!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return UNORM_NO;
    }
    UnicodeString text(length<0, s, length);
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->quickCheck(text, *pErrorCode);
}
// C wrapper around Normalizer2::spanQuickCheckYes().
// length may be -1, in which case s is wrapped as a NUL-terminated string.
// Returns 0 on prior error or on an invalid pointer/length combination.
U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
                         const char16_t *s, int32_t length,
                         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(length<-1 || (s==nullptr && length!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString text(length<0, s, length);
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->spanQuickCheckYes(text, *pErrorCode);
}
// C wrapper: forwards to hasBoundaryBefore() on the C++ object behind norm2.
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->hasBoundaryBefore(c);
}
// C wrapper: forwards to hasBoundaryAfter() on the C++ object behind norm2.
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->hasBoundaryAfter(c);
}
// C wrapper: forwards to isInert() on the C++ object behind norm2.
U_CAPI UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->isInert(c);
}
// Some properties APIs ---------------------------------------------------- ***
// Returns the combining class of c as reported by the NFD normalizer instance,
// or 0 if that instance could not be obtained.
U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    return nfd->getCombiningClass(c);
}
// Returns getFCD16(c) from the NFC Normalizer2Impl, or 0 if that impl could not be obtained.
U_CFUNC uint16_t
unorm_getFCD16(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    return nfcImpl->getFCD16(c);
}
#endif // !UCONFIG_NO_NORMALIZATION

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,988 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: normalizer2impl.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov22
* created by: Markus W. Scherer
*/
#ifndef __NORMALIZER2IMPL_H__
#define __NORMALIZER2IMPL_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/normalizer2.h"
#include "unicode/ucptrie.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "mutex.h"
#include "udataswp.h"
#include "uset_imp.h"
// When the nfc.nrm data is *not* hardcoded into the common library
// (with this constant set to 0),
// then it needs to be built into the data package:
// Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT
#define NORM2_HARDCODE_NFC_DATA 1
U_NAMESPACE_BEGIN
struct CanonIterData;
class ByteSink;
class Edits;
class InitCanonIterData;
class LcccContext;
class U_COMMON_API Hangul {
public:
    /* Korean Hangul and Jamo constants */
    enum {
        JAMO_L_BASE=0x1100,     /* "lead" jamo */
        JAMO_L_END=0x1112,
        JAMO_V_BASE=0x1161,     /* "vowel" jamo */
        JAMO_V_END=0x1175,
        JAMO_T_BASE=0x11a7,     /* "trail" jamo */
        JAMO_T_END=0x11c2,

        HANGUL_BASE=0xac00,
        HANGUL_END=0xd7a3,

        JAMO_L_COUNT=19,
        JAMO_V_COUNT=21,
        JAMO_T_COUNT=28,

        JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,

        HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
        HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
    };

    /** True if c lies in [HANGUL_BASE, HANGUL_LIMIT[. */
    static inline UBool isHangul(UChar32 c) {
        return c>=HANGUL_BASE && c<HANGUL_LIMIT;
    }
    /** True if c is a Hangul syllable whose trail-jamo index is 0 (an LV syllable). */
    static inline UBool
    isHangulLV(UChar32 c) {
        const UChar32 syllableIndex=c-HANGUL_BASE;
        if(syllableIndex<0 || syllableIndex>=HANGUL_COUNT) {
            return false;
        }
        return syllableIndex%JAMO_T_COUNT==0;
    }
    /** True if c is one of the JAMO_L_COUNT lead jamo starting at JAMO_L_BASE. */
    static inline UBool isJamoL(UChar32 c) {
        // The unsigned comparison also rejects code points below the base.
        const uint32_t offset=(uint32_t)(c-JAMO_L_BASE);
        return offset<JAMO_L_COUNT;
    }
    /** True if c is one of the JAMO_V_COUNT vowel jamo starting at JAMO_V_BASE. */
    static inline UBool isJamoV(UChar32 c) {
        // The unsigned comparison also rejects code points below the base.
        const uint32_t offset=(uint32_t)(c-JAMO_V_BASE);
        return offset<JAMO_V_COUNT;
    }
    /** True if c is a trail jamo, that is, strictly between JAMO_T_BASE and JAMO_T_BASE+JAMO_T_COUNT. */
    static inline UBool isJamoT(UChar32 c) {
        const int32_t offset=c-JAMO_T_BASE;
        return offset>0 && offset<JAMO_T_COUNT;  // not JAMO_T_BASE itself
    }
    /** True if c falls into the lead, vowel or trail jamo range. */
    static UBool isJamo(UChar32 c) {
        if(c<JAMO_L_BASE || c>JAMO_T_END) {
            return false;  // outside the overall jamo span
        }
        if(c<=JAMO_L_END) {
            return true;  // lead jamo
        }
        if(c>=JAMO_V_BASE && c<=JAMO_V_END) {
            return true;  // vowel jamo
        }
        return c>JAMO_T_BASE;  // trail jamo, excluding JAMO_T_BASE itself
    }

    /**
     * Decomposes c, which must be a Hangul syllable, into buffer
     * and returns the length of the decomposition (2 or 3).
     */
    static inline int32_t decompose(UChar32 c, char16_t buffer[3]) {
        const UChar32 syllableIndex=c-HANGUL_BASE;
        const UChar32 trailIndex=syllableIndex%JAMO_T_COUNT;
        const UChar32 lvIndex=syllableIndex/JAMO_T_COUNT;
        buffer[0]=(char16_t)(JAMO_L_BASE+lvIndex/JAMO_V_COUNT);
        buffer[1]=(char16_t)(JAMO_V_BASE+lvIndex%JAMO_V_COUNT);
        if(trailIndex==0) {
            return 2;  // LV syllable: no trail jamo
        }
        buffer[2]=(char16_t)(JAMO_T_BASE+trailIndex);
        return 3;
    }

    /**
     * Decomposes c, which must be a Hangul syllable, into buffer.
     * This is the raw, not recursive, decomposition. Its length is always 2.
     */
    static inline void getRawDecomposition(UChar32 c, char16_t buffer[2]) {
        const UChar32 syllableIndex=c-HANGUL_BASE;
        const UChar32 trailIndex=syllableIndex%JAMO_T_COUNT;
        if(trailIndex!=0) {
            // LVT syllable: raw decomposition is the LV syllable plus the trail jamo.
            buffer[0]=(char16_t)(c-trailIndex);  // LV syllable
            buffer[1]=(char16_t)(JAMO_T_BASE+trailIndex);
            return;
        }
        // LV syllable: raw decomposition is lead jamo plus vowel jamo.
        const UChar32 lvIndex=syllableIndex/JAMO_T_COUNT;
        buffer[0]=(char16_t)(JAMO_L_BASE+lvIndex/JAMO_V_COUNT);
        buffer[1]=(char16_t)(JAMO_V_BASE+lvIndex%JAMO_V_COUNT);
    }
private:
    Hangul() = delete;  // no instantiation
};
class Normalizer2Impl;
class U_COMMON_API ReorderingBuffer : public UMemory {
public:
    /** Constructs only; init() should be called. */
    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
        impl(ni), str(dest),
        start(nullptr), reorderStart(nullptr), limit(nullptr),
        remainingCapacity(0), lastCC(0) {}
    /** Constructs, removes the string contents, and initializes for a small initial capacity. */
    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
    /** Hands the buffer back to the destination string with the current length, if a buffer was obtained. */
    ~ReorderingBuffer() {
        if (start != nullptr) {
            str.releaseBuffer((int32_t)(limit-start));
        }
    }
    UBool init(int32_t destCapacity, UErrorCode &errorCode);

    UBool isEmpty() const { return start==limit; }
    int32_t length() const { return (int32_t)(limit-start); }
    char16_t *getStart() { return start; }
    char16_t *getLimit() { return limit; }
    uint8_t getLastCC() const { return lastCC; }

    UBool equals(const char16_t *start, const char16_t *limit) const;
    UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;

    /** Appends code point c with combining class cc, dispatching on BMP vs. supplementary. */
    UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
        return (c<=0xffff) ?
            appendBMP((char16_t)c, cc, errorCode) :
            appendSupplementary(c, cc, errorCode);
    }
    UBool append(const char16_t *s, int32_t length, UBool isNFD,
                 uint8_t leadCC, uint8_t trailCC,
                 UErrorCode &errorCode);
    /**
     * Appends one BMP code unit with combining class cc.
     * Calls resize(1, ...) first when no capacity remains.
     * @return false if resize() fails, otherwise true
     */
    UBool appendBMP(char16_t c, uint8_t cc, UErrorCode &errorCode) {
        if(remainingCapacity==0 && !resize(1, errorCode)) {
            return false;
        }
        if(lastCC<=cc || cc==0) {
            // In order already: write at the end.
            *limit++=c;
            lastCC=cc;
            if(cc<=1) {
                reorderStart=limit;
            }
        } else {
            // cc is lower than the last one: let insert() place it.
            insert(c, cc);
        }
        --remainingCapacity;
        return true;
    }
    UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
    UBool appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode);
    void remove();
    void removeSuffix(int32_t suffixLength);
    /**
     * Moves limit to newLimit, credits the difference to remainingCapacity,
     * and restarts reordering there (reorderStart=limit, lastCC=0).
     */
    void setReorderingLimit(char16_t *newLimit) {
        remainingCapacity+=(int32_t)(limit-newLimit);
        reorderStart=limit=newLimit;
        lastCC=0;
    }
    /** Sets s to the text in [reorderStart, limit[. */
    void copyReorderableSuffixTo(UnicodeString &s) const {
        s.setTo(ConstChar16Ptr(reorderStart), (int32_t)(limit-reorderStart));
    }
private:
    /*
     * TODO: Revisit whether it makes sense to track reorderStart.
     * It is set to after the last known character with cc<=1,
     * which stops previousCC() before it reads that character and looks up its cc.
     * previousCC() is normally only called from insert().
     * In other words, reorderStart speeds up the insertion of a combining mark
     * into a multi-combining mark sequence where it does not belong at the end.
     * This might not be worth the trouble.
     * On the other hand, it's not a huge amount of trouble.
     *
     * We probably need it for UNORM_SIMPLE_APPEND.
     */

    UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
    void insert(UChar32 c, uint8_t cc);
    /** Writes c at p as one code unit (BMP) or as a lead/trail surrogate pair. */
    static void writeCodePoint(char16_t *p, UChar32 c) {
        if(c<=0xffff) {
            *p=(char16_t)c;
        } else {
            p[0]=U16_LEAD(c);
            p[1]=U16_TRAIL(c);
        }
    }
    UBool resize(int32_t appendLength, UErrorCode &errorCode);

    const Normalizer2Impl &impl;
    UnicodeString &str;
    char16_t *start, *reorderStart, *limit;
    int32_t remainingCapacity;
    uint8_t lastCC;

    // private backward iterator
    void setIterator() { codePointStart=limit; }
    void skipPrevious();  // Requires start<codePointStart.
    uint8_t previousCC();  // Returns 0 if there is no previous character.

    // Default-initialized so that neither constructor leaves the iterator
    // pointers indeterminate before setIterator() is called.
    char16_t *codePointStart=nullptr, *codePointLimit=nullptr;
};
/**
* Low-level implementation of the Unicode Normalization Algorithm.
* For the data structure and details see the documentation at the end of
* this normalizer2impl.h and in the design doc at
* https://icu.unicode.org/design/normalization/custom
*/
class U_COMMON_API Normalizer2Impl : public UObject {
public:
    /** Only nulls normTrie and fCanonIterData; the remaining data fields are presumably set by init(). */
    Normalizer2Impl() : normTrie(nullptr), fCanonIterData(nullptr) {}
    virtual ~Normalizer2Impl();

    void init(const int32_t *inIndexes, const UCPTrie *inTrie,
              const uint16_t *inExtraData, const uint8_t *inSmallFCD);

    void addLcccChars(UnicodeSet &set) const;
    void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
    void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;

    // low-level properties ------------------------------------------------ ***

    UBool ensureCanonIterData(UErrorCode &errorCode) const;

    // The trie stores values for lead surrogate code *units*.
    // Surrogate code *points* are inert.
    uint16_t getNorm16(UChar32 c) const {
        return U_IS_LEAD(c) ?
            static_cast<uint16_t>(INERT) :
            UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);
    }
    /** Plain trie lookup, without the lead-surrogate special case of getNorm16(). */
    uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); }

    /**
     * Maps a norm16 value to a composition quick check result:
     * YES below minNoNo or from MIN_YES_YES_WITH_CC up,
     * MAYBE from minMaybeYes up to MIN_YES_YES_WITH_CC, otherwise NO.
     */
    UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
            return UNORM_YES;
        } else if(minMaybeYes<=norm16) {
            return UNORM_MAYBE;
        } else {
            return UNORM_NO;
        }
    }
    UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeYes; }
    UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
    UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }

    /**
     * Returns the combining class encoded by norm16:
     * taken from the value itself at or above MIN_NORMAL_MAYBE_YES,
     * 0 below minNoNo or at/above limitNoNo, otherwise read from the noNo mapping.
     */
    uint8_t getCC(uint16_t norm16) const {
        if(norm16>=MIN_NORMAL_MAYBE_YES) {
            return getCCFromNormalYesOrMaybe(norm16);
        }
        if(norm16<minNoNo || limitNoNo<=norm16) {
            return 0;
        }
        return getCCFromNoNo(norm16);
    }
    static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
        return (uint8_t)(norm16 >> OFFSET_SHIFT);
    }
    static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
        return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
    }
    uint8_t getCCFromYesOrMaybeCP(UChar32 c) const {
        // Code points below minCompNoMaybeCP skip the trie lookup.
        if (c < minCompNoMaybeCP) { return 0; }
        return getCCFromYesOrMaybe(getNorm16(c));
    }

    /**
     * Returns the FCD data for code point c.
     * @param c A Unicode code point.
     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
     */
    uint16_t getFCD16(UChar32 c) const {
        if(c<minDecompNoCP) {
            return 0;
        } else if(c<=0xffff) {
            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
        }
        return getFCD16FromNormData(c);
    }
    /**
     * Returns the FCD data for the next code point (post-increment).
     * Might skip only a lead surrogate rather than the whole surrogate pair if none of
     * the supplementary code points associated with the lead surrogate have non-zero FCD data.
     * @param s A valid pointer into a string. Requires s!=limit.
     * @param limit The end of the string, or NULL.
     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
     */
    uint16_t nextFCD16(const char16_t *&s, const char16_t *limit) const {
        UChar32 c=*s++;
        if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
            return 0;
        }
        char16_t c2;
        if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) {
            c=U16_GET_SUPPLEMENTARY(c, c2);
            ++s;
        }
        return getFCD16FromNormData(c);
    }
    /**
     * Returns the FCD data for the previous code point (pre-decrement).
     * @param start The start of the string.
     * @param s A valid pointer into a string. Requires start<s.
     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
     */
    uint16_t previousFCD16(const char16_t *start, const char16_t *&s) const {
        UChar32 c=*--s;
        if(c<minDecompNoCP) {
            return 0;
        }
        if(!U16_IS_TRAIL(c)) {
            if(!singleLeadMightHaveNonZeroFCD16(c)) {
                return 0;
            }
        } else {
            char16_t c2;
            if(start<s && U16_IS_LEAD(c2=*(s-1))) {
                c=U16_GET_SUPPLEMENTARY(c2, c);
                --s;
            }
        }
        return getFCD16FromNormData(c);
    }

    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
    UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
        // 0<=lead<=0xffff
        // One smallFCD byte per 256 code units, one bit per 32 code units.
        uint8_t bits=smallFCD[lead>>8];
        if(bits==0) { return false; }
        return (UBool)((bits>>((lead>>5)&7))&1);
    }
    /** Returns the FCD value from the regular normalization data. */
    uint16_t getFCD16FromNormData(UChar32 c) const;

    /**
     * Gets the decomposition for one code point.
     * @param c code point
     * @param buffer out-only buffer for algorithmic decompositions
     * @param length out-only, takes the length of the decomposition, if any
     * @return pointer to the decomposition, or NULL if none
     */
    const char16_t *getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const;

    /**
     * Gets the raw decomposition for one code point.
     * @param c code point
     * @param buffer out-only buffer for algorithmic decompositions
     * @param length out-only, takes the length of the decomposition, if any
     * @return pointer to the decomposition, or NULL if none
     */
    const char16_t *getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const;

    UChar32 composePair(UChar32 a, UChar32 b) const;

    UBool isCanonSegmentStarter(UChar32 c) const;
    UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;

    enum {
        // Fixed norm16 values.
        MIN_YES_YES_WITH_CC=0xfe02,
        JAMO_VT=0xfe00,
        MIN_NORMAL_MAYBE_YES=0xfc00,
        JAMO_L=2,  // offset=1 hasCompBoundaryAfter=false
        INERT=1,  // offset=0 hasCompBoundaryAfter=true

        // norm16 bit 0 is comp-boundary-after.
        HAS_COMP_BOUNDARY_AFTER=1,
        OFFSET_SHIFT=1,

        // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
        // tccc (0, 1, >1) for quick FCC boundary-after tests.
        DELTA_TCCC_0=0,
        DELTA_TCCC_1=2,
        DELTA_TCCC_GT_1=4,
        DELTA_TCCC_MASK=6,
        DELTA_SHIFT=3,

        MAX_DELTA=0x40
    };

    enum {
        // Byte offsets from the start of the data, after the generic header.
        IX_NORM_TRIE_OFFSET,
        IX_EXTRA_DATA_OFFSET,
        IX_SMALL_FCD_OFFSET,
        IX_RESERVED3_OFFSET,
        IX_RESERVED4_OFFSET,
        IX_RESERVED5_OFFSET,
        IX_RESERVED6_OFFSET,
        IX_TOTAL_SIZE,

        // Code point thresholds for quick check codes.
        IX_MIN_DECOMP_NO_CP,
        IX_MIN_COMP_NO_MAYBE_CP,

        // Norm16 value thresholds for quick check combinations and types of extra data.

        /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
        IX_MIN_YES_NO,
        /** Mappings are comp-normalized. */
        IX_MIN_NO_NO,
        IX_LIMIT_NO_NO,
        IX_MIN_MAYBE_YES,

        /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
        IX_MIN_YES_NO_MAPPINGS_ONLY,
        /** Mappings are not comp-normalized but have a comp boundary before. */
        IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
        /** Mappings do not have a comp boundary before. */
        IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
        /** Mappings to the empty string. */
        IX_MIN_NO_NO_EMPTY,

        IX_MIN_LCCC_CP,
        IX_RESERVED19,
        IX_COUNT
    };

    enum {
        MAPPING_HAS_CCC_LCCC_WORD=0x80,
        MAPPING_HAS_RAW_MAPPING=0x40,
        // unused bit 0x20,
        MAPPING_LENGTH_MASK=0x1f
    };

    enum {
        COMP_1_LAST_TUPLE=0x8000,
        COMP_1_TRIPLE=1,
        COMP_1_TRAIL_LIMIT=0x3400,
        COMP_1_TRAIL_MASK=0x7ffe,
        COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
        COMP_2_TRAIL_SHIFT=6,
        COMP_2_TRAIL_MASK=0xffc0
    };

    // higher-level functionality ------------------------------------------ ***

    // NFD without an NFD Normalizer2 instance.
    UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest,
                             UErrorCode &errorCode) const;
    /**
     * Decomposes [src, limit[ and writes the result to dest.
     * limit can be NULL if src is NUL-terminated.
     * destLengthEstimate is the initial dest buffer capacity and can be -1.
     */
    void decompose(const char16_t *src, const char16_t *limit,
                   UnicodeString &dest, int32_t destLengthEstimate,
                   UErrorCode &errorCode) const;

    const char16_t *decompose(const char16_t *src, const char16_t *limit,
                              ReorderingBuffer *buffer, UErrorCode &errorCode) const;
    void decomposeAndAppend(const char16_t *src, const char16_t *limit,
                            UBool doDecompose,
                            UnicodeString &safeMiddle,
                            ReorderingBuffer &buffer,
                            UErrorCode &errorCode) const;

    /** sink==nullptr: isNormalized()/spanQuickCheckYes() */
    const uint8_t *decomposeUTF8(uint32_t options,
                                 const uint8_t *src, const uint8_t *limit,
                                 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;

    UBool compose(const char16_t *src, const char16_t *limit,
                  UBool onlyContiguous,
                  UBool doCompose,
                  ReorderingBuffer &buffer,
                  UErrorCode &errorCode) const;
    const char16_t *composeQuickCheck(const char16_t *src, const char16_t *limit,
                                      UBool onlyContiguous,
                                      UNormalizationCheckResult *pQCResult) const;
    void composeAndAppend(const char16_t *src, const char16_t *limit,
                          UBool doCompose,
                          UBool onlyContiguous,
                          UnicodeString &safeMiddle,
                          ReorderingBuffer &buffer,
                          UErrorCode &errorCode) const;

    /** sink==nullptr: isNormalized() */
    UBool composeUTF8(uint32_t options, UBool onlyContiguous,
                      const uint8_t *src, const uint8_t *limit,
                      ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;

    const char16_t *makeFCD(const char16_t *src, const char16_t *limit,
                            ReorderingBuffer *buffer, UErrorCode &errorCode) const;
    void makeFCDAndAppend(const char16_t *src, const char16_t *limit,
                          UBool doMakeFCD,
                          UnicodeString &safeMiddle,
                          ReorderingBuffer &buffer,
                          UErrorCode &errorCode) const;

    UBool hasDecompBoundaryBefore(UChar32 c) const;
    UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
    UBool hasDecompBoundaryAfter(UChar32 c) const;
    UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
    UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }

    UBool hasCompBoundaryBefore(UChar32 c) const {
        return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
    }
    UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
        return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
    }
    /**
     * True if c is comp-yes with ccc=0 and has a comp boundary after;
     * with onlyContiguous, additionally requires an inert norm16 or
     * a first mapping unit <= 0x1ff.
     */
    UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
        uint16_t norm16=getNorm16(c);
        return isCompYesAndZeroCC(norm16) &&
            (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
            (!onlyContiguous || isInert(norm16) || *getMapping(norm16) <= 0x1ff);
    }

    UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); }
    UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); }
    UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
private:
    friend class InitCanonIterData;
    friend class LcccContext;

    UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
    UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
    static UBool isInert(uint16_t norm16) { return norm16==INERT; }
    static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; }
    static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
    uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
    UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; }
    UBool isHangulLVT(uint16_t norm16) const {
        return norm16==hangulLVT();
    }
    UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
    // UBool isCompYes(uint16_t norm16) const {
    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
    // }
    // UBool isCompYesOrMaybe(uint16_t norm16) const {
    //     return norm16<minNoNo || minMaybeYes<=norm16;
    // }
    // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    // }
    UBool isDecompYesAndZeroCC(uint16_t norm16) const {
        return norm16<minYesNo ||
               norm16==JAMO_VT ||
               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
    }
    /**
     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
     * the MaybeYes which combine-forward and have ccc=0.
     * (Standard Unicode 10 normalization does not have such characters.)
     */
    UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    }
    UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }

    // For use with isCompYes().
    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
    // static uint8_t getCCFromYes(uint16_t norm16) {
    //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
    // }
    /** Reads the ccc from the word before the mapping's first unit, if that unit flags its presence; else 0. */
    uint8_t getCCFromNoNo(uint16_t norm16) const {
        const uint16_t *mapping=getMapping(norm16);
        if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
            return (uint8_t)*(mapping-1);
        } else {
            return 0;
        }
    }
    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
    uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
        if(norm16<=minYesNo) {
            return 0;  // yesYes and Hangul LV have ccc=tccc=0
        } else {
            // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
            return (uint8_t)(*getMapping(norm16)>>8);  // tccc from yesNo
        }
    }
    uint8_t getPreviousTrailCC(const char16_t *start, const char16_t *p) const;
    uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;

    // Requires algorithmic-NoNo.
    UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
        return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
    }
    UChar32 getAlgorithmicDelta(uint16_t norm16) const {
        return (norm16>>DELTA_SHIFT)-centerNoNoDelta;
    }

    // Requires minYesNo<norm16<limitNoNo.
    const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); }
    /**
     * Returns the compositions list for a decomp-yes norm16, or nullptr if
     * norm16<JAMO_L or norm16>=MIN_NORMAL_MAYBE_YES.
     * For minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES the list is in maybeYesCompositions[];
     * the norm16 difference is shifted right by OFFSET_SHIFT to get the array index,
     * the same computation as in getCompositionsListForMaybe().
     */
    const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
        if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
            return nullptr;
        } else if(norm16<minMaybeYes) {
            return getMapping(norm16);  // for yesYes; if Jamo L: harmless empty list
        } else {
            // norm16 values carry data indexes shifted left by OFFSET_SHIFT,
            // so the difference must be shifted back before indexing.
            return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT);
        }
    }
    const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
        // A composite has both mapping & compositions list.
        const uint16_t *list=getMapping(norm16);
        return list+  // mapping pointer
            1+  // +1 to skip the first unit with the mapping length
            (*list&MAPPING_LENGTH_MASK);  // + mapping length
    }
    const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const {
        // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
        return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT);
    }
    /**
     * @param norm16 norm16 value of a code point that must have compositions
     * @return compositions list pointer
     */
    const uint16_t *getCompositionsList(uint16_t norm16) const {
        return isDecompYes(norm16) ?
                getCompositionsListForDecompYes(norm16) :
                getCompositionsListForComposite(norm16);
    }

    const char16_t *copyLowPrefixFromNulTerminated(const char16_t *src,
                                                   UChar32 minNeedDataCP,
                                                   ReorderingBuffer *buffer,
                                                   UErrorCode &errorCode) const;

    enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY };

    const char16_t *decomposeShort(const char16_t *src, const char16_t *limit,
                                   UBool stopAtCompBoundary, UBool onlyContiguous,
                                   ReorderingBuffer &buffer, UErrorCode &errorCode) const;
    UBool decompose(UChar32 c, uint16_t norm16,
                    ReorderingBuffer &buffer, UErrorCode &errorCode) const;

    const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
                                  StopAt stopAt, UBool onlyContiguous,
                                  ReorderingBuffer &buffer, UErrorCode &errorCode) const;

    static int32_t combine(const uint16_t *list, UChar32 trail);
    void addComposites(const uint16_t *list, UnicodeSet &set) const;
    void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                   UBool onlyContiguous) const;

    UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
        return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
    }
    UBool norm16HasCompBoundaryBefore(uint16_t norm16) const  {
        return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
    }
    UBool hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const;
    UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
    UBool hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
                               UBool onlyContiguous) const;
    UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
                               UBool onlyContiguous) const;
    /** Tests norm16 bit 0; with onlyContiguous (FCC) the tccc must also be <=1. */
    UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
        return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
            (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
    }
    /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
    UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
        return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
            (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff);
    }

    const char16_t *findPreviousCompBoundary(const char16_t *start, const char16_t *p, UBool onlyContiguous) const;
    const char16_t *findNextCompBoundary(const char16_t *p, const char16_t *limit, UBool onlyContiguous) const;

    const char16_t *findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const;
    const char16_t *findNextFCDBoundary(const char16_t *p, const char16_t *limit) const;

    void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
                                     CanonIterData &newData, UErrorCode &errorCode) const;

    int32_t getCanonValue(UChar32 c) const;
    const UnicodeSet &getCanonStartSet(int32_t n) const;

    // UVersionInfo dataVersion;

    // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
    char16_t minDecompNoCP;
    char16_t minCompNoMaybeCP;
    char16_t minLcccCP;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    uint16_t minYesNo;
    uint16_t minYesNoMappingsOnly;
    uint16_t minNoNo;
    uint16_t minNoNoCompBoundaryBefore;
    uint16_t minNoNoCompNoMaybeCC;
    uint16_t minNoNoEmpty;
    uint16_t limitNoNo;
    uint16_t centerNoNoDelta;
    uint16_t minMaybeYes;

    const UCPTrie *normTrie;
    const uint16_t *maybeYesCompositions;
    const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0

    UInitOnce       fCanonIterDataInitOnce {};
    CanonIterData  *fCanonIterData;
};
// bits in canonIterData
#define CANON_NOT_SEGMENT_STARTER 0x80000000
#define CANON_HAS_COMPOSITIONS 0x40000000
#define CANON_HAS_SET 0x200000
#define CANON_VALUE_MASK 0x1fffff
/**
* ICU-internal shortcut for quick access to standard Unicode normalization.
*/
class U_COMMON_API Normalizer2Factory {
public:
// Accessors for Normalizer2 instances: FCD, FCC, no-op, or selected by a UNormalizationMode.
// NOTE(review): failures are presumably reported through errorCode, per ICU convention — confirm in the implementation.
static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
// Accessors for the low-level Normalizer2Impl objects, named after the NFC, NFKC and NFKC_CF data.
static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
// Get the Impl instance of the Normalizer2.
// Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
private:
Normalizer2Factory() = delete; // No instantiation.
};
U_NAMESPACE_END
U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
* Get the NF*_QC property for a code point, for u_getIntPropertyValue().
* @internal
*/
U_CFUNC UNormalizationCheckResult
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
/**
* Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
* @internal
*/
U_CFUNC uint16_t
unorm_getFCD16(UChar32 c);
/**
* Format of Normalizer2 .nrm data files.
* Format version 4.0.
*
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
* ICU ships with data files for standard Unicode Normalization Forms
* NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm),
* NFKC_Casefold (nfkc_cf.nrm) and NFKC_Simple_Casefold (nfkc_scf.nrm).
* Custom (application-specific) data can be built into additional .nrm files
* with the gennorm2 build tool.
* ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
*
* Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
* cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
*
* A .nrm file begins with a standard ICU data file header
* (DataHeader, see ucmndata.h and unicode/udata.h).
* The UDataInfo.dataVersion field usually contains the Unicode version
* for which the data was generated.
*
* After the header, the file contains the following parts.
* Constants are defined as enum values of the Normalizer2Impl class.
*
* Many details of the data structures are described in the design doc
* which is at https://icu.unicode.org/design/normalization/custom
*
* int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
*
* The first eight indexes are byte offsets in ascending order.
* Each byte offset marks the start of the next part in the data file,
* and the end of the previous one.
* When two consecutive byte offsets are the same, then the corresponding part is empty.
* Byte offsets are offsets from after the header,
* that is, from the beginning of the indexes[].
* Each part starts at an offset with proper alignment for its data.
* If necessary, the previous part may include padding bytes to achieve this alignment.
*
* minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
* with a decomposition mapping, that is, with NF*D_QC=No.
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
* minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
* is the lowest code point with lccc!=0.
*
* The next eight indexes are thresholds of 16-bit trie values for ranges of
* values indicating multiple normalization properties.
* They are listed here in threshold order, not in the order they are stored in the indexes.
* minYesNo=indexes[IX_MIN_YES_NO];
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
* minNoNo=indexes[IX_MIN_NO_NO];
* minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
* minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
* minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
* limitNoNo=indexes[IX_LIMIT_NO_NO];
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
* See the normTrie description below and the design doc for details.
*
* UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie
*
* The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
* Rather than using independent bits in the value (which would require more than 16 bits),
* information is extracted primarily via range checks.
* Except, format version 3 uses bit 0 for hasCompBoundaryAfter().
* For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
* which means it has a two-way (round-trip) decomposition mapping.
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
* pointing to mappings, compositions lists, or both.
* Value norm16==INERT (0 in versions 1 & 2, 1 in version 3)
* means that the character is normalization-inert, that is,
* it does not have a mapping, does not participate in composition, has a zero
* canonical combining class, and forms a boundary where text before it and after it
* can be normalized independently.
* For details about how multiple properties are encoded in 16-bit values
* see the design doc.
* Note that the encoding cannot express all combinations of the properties involved;
* it only supports those combinations that are allowed by
* the Unicode Normalization algorithms. Details are in the design doc as well.
* The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
*
* The trie has a value for each lead surrogate code unit representing the "worst case"
* properties of the 1024 supplementary characters whose UTF-16 form starts with
* the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
* then their lead surrogate code unit has the trie value INERT.
* When the lead surrogate unit's value exceeds the quick check minimum during processing,
* the properties for the full supplementary code point need to be looked up.
*
* uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
* uint16_t extraData[];
*
* There is only one byte offset for the end of these two arrays.
* The split between them is given by the constant and variable mentioned above.
* In version 3, the difference must be shifted right by OFFSET_SHIFT.
*
* The maybeYesCompositions array contains compositions lists for characters that
* combine both forward (as starters in composition pairs)
* and backward (as trailing characters in composition pairs).
* Such characters do not occur in Unicode 5.2 but are allowed by
* the Unicode Normalization algorithms.
* If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES
* and the maybeYesCompositions array is empty.
* If there are such characters, then minMaybeYes is subtracted from their norm16 values
* to get the index into this array.
*
* The extraData array contains compositions lists for "YesYes" characters,
* followed by mappings and optional compositions lists for "YesNo" characters,
* followed by only mappings for "NoNo" characters.
* (Referring to pairs of NFC/NFD quick check values.)
* The norm16 values of those characters are directly indexes into the extraData array.
* In version 3, the norm16 values must be shifted right by OFFSET_SHIFT
* for accessing extraData.
*
* The data structures for compositions lists and mappings are described in the design doc.
*
* uint8_t smallFCD[0x100]; -- new in format version 2
*
* This is a bit set to help speed up FCD value lookups in the absence of a full
* UTrie2 or other large data structure with the full FCD value mapping.
*
* Each smallFCD bit is set if any of the corresponding 32 BMP code points
* has a non-zero FCD value (lccc!=0 or tccc!=0).
* Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
* A bit for 32 lead surrogates is set if any of the 32k corresponding
* _supplementary_ code points has a non-zero FCD value.
*
* This bit set is most useful for the large blocks of CJK characters with FCD=0.
*
* Changes from format version 1 to format version 2 ---------------------------
*
* - Addition of data for raw (not recursively decomposed) mappings.
* + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
* the mapping is to an empty string or when the character combines-forward.
* This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
* is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
* + For details see the design doc.
* - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
* distinct ranges (combines-forward vs. not)
* so that a range check can be used to find out if there is a compositions list.
* This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
* It is needed for the new (in ICU 49) composePair(), not for other normalization.
* - Addition of the smallFCD[] bit set.
*
* Changes from format version 2 to format version 3 (ICU 60) ------------------
*
* - norm16 bit 0 indicates hasCompBoundaryAfter(),
* except that for contiguous composition (FCC) the tccc must be checked as well.
* Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
* Thresholds like minNoNo are tested before shifting.
*
* - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
* to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
* See DELTA_TCCC_MASK etc.
* This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
* minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
*
* - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
* and ASCII characters are mapped algorithmically only to other ASCII characters.
* This helps with hasCompBoundaryBefore() and compose() fast paths.
* It is never necessary any more to loop for algorithmic mappings.
*
* - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
* indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
* and separation of the noNo extraData into distinct ranges.
* With this, the noNo norm16 value indicates whether the mapping is
* compose-normalized, not normalized but hasCompBoundaryBefore(),
* not even that, or maps to an empty string.
* hasCompBoundaryBefore() can be determined solely from the norm16 value.
*
* - The norm16 value for Hangul LVT is now different from that for Hangul LV,
* so that hasCompBoundaryAfter() need not check for the syllable type.
* For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
* For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
* The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
* to simplify some code.
*
* - The extraData firstUnit bit 5 is no longer necessary
* (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
* is reserved again, and always set to 0.
*
* - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
* This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
* U+00AD Soft Hyphen maps to an empty string,
* which is artificially assigned "worst case" values lccc=1 and tccc=255.
*
* - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
*
* Changes from format version 3 to format version 4 (ICU 63) ------------------
*
* Switched from UTrie2 to UCPTrie/CodePointTrie.
*
* The new trie no longer stores different values for surrogate code *units* vs.
* surrogate code *points*.
* Lead surrogates still have values for optimized UTF-16 string processing.
* When looking up code point properties, the code now checks for lead surrogates and
* treats them as inert.
*
* gennorm2 now has to reject mappings for surrogate code points.
* UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its
* custom normalization data file.
*/
#endif /* !UCONFIG_NO_NORMALIZATION */
#endif /* __NORMALIZER2IMPL_H__ */

View file

@ -0,0 +1,529 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*************************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2012, International Business Machines Corporation and
* others. All Rights Reserved.
*************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
#include "unicode/normlzr.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "normalizer2impl.h"
#include "uprops.h" // for uniset_getUnicode32Instance()
#if defined(move32)
// System can define move32 intrinsics, but the char iters define move32 method
// using same undef trick in headers, so undef here to re-enable the method.
#undef move32
#endif
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------
// Builds a Normalizer that iterates over str through a newly allocated
// StringCharacterIterator; all iteration state starts at zero and no
// options are set.
Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode)
    : UObject(),
      fFilteredNorm2(nullptr),
      fNorm2(nullptr),
      fUMode(mode),
      fOptions(0),
      text(new StringCharacterIterator(str)),
      currentIndex(0),
      nextIndex(0),
      buffer(),
      bufferPos(0)
{
    // fNorm2 is still null here; init() selects the instance for fUMode.
    init();
}
// Builds a Normalizer that iterates over the given UTF-16 array through a
// newly allocated UCharCharacterIterator; all iteration state starts at zero
// and no options are set.
Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode)
    : UObject(),
      fFilteredNorm2(nullptr),
      fNorm2(nullptr),
      fUMode(mode),
      fOptions(0),
      text(new UCharCharacterIterator(str, length)),
      currentIndex(0),
      nextIndex(0),
      buffer(),
      bufferPos(0)
{
    // fNorm2 is still null here; init() selects the instance for fUMode.
    init();
}
// Builds a Normalizer that iterates over a clone of iter, so the iterator
// passed in is not retained; all iteration state starts at zero and no
// options are set.
Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode)
    : UObject(),
      fFilteredNorm2(nullptr),
      fNorm2(nullptr),
      fUMode(mode),
      fOptions(0),
      text(iter.clone()),
      currentIndex(0),
      nextIndex(0),
      buffer(),
      bufferPos(0)
{
    // fNorm2 is still null here; init() selects the instance for fUMode.
    init();
}
// Copy constructor: duplicates mode, options, a clone of the text iterator
// and the complete iteration state (indexes, buffer, buffer position).
// The normalizer pointers are not copied; init() recreates them so that
// each object has its own FilteredNormalizer2 (if any).
Normalizer::Normalizer(const Normalizer &copy)
    : UObject(copy),
      fFilteredNorm2(nullptr),
      fNorm2(nullptr),
      fUMode(copy.fUMode),
      fOptions(copy.fOptions),
      text(copy.text->clone()),
      currentIndex(copy.currentIndex),
      nextIndex(copy.nextIndex),
      buffer(copy.buffer),
      bufferPos(copy.bufferPos)
{
    init();
}
void
Normalizer::init() {
UErrorCode errorCode=U_ZERO_ERROR;
fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
if(fOptions&UNORM_UNICODE_3_2) {
delete fFilteredNorm2;
fNorm2=fFilteredNorm2=
new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
}
if(U_FAILURE(errorCode)) {
errorCode=U_ZERO_ERROR;
fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
}
}
// Releases the two heap objects this Normalizer allocated itself:
// the optional filtered normalizer and the text iterator.
// fNorm2 is not deleted here; it either aliases fFilteredNorm2 or was
// obtained from Normalizer2Factory.
Normalizer::~Normalizer()
{
    delete fFilteredNorm2;  // may be nullptr when no filter was created
    delete text;
}
// Returns a heap-allocated copy of this Normalizer made with the copy
// constructor.
Normalizer*
Normalizer::clone() const
{
    Normalizer *duplicate=new Normalizer(*this);
    return duplicate;
}
/**
 * Computes a hash code for this iterator by summing the hash of the text
 * iterator, the mode, the options, the hash of the normalized buffer and
 * the three position fields.
 */
int32_t Normalizer::hashCode() const
{
    // Kept as one left-to-right sum so that the integer promotions match
    // the operands exactly.
    return text->hashCode()
        + fUMode
        + fOptions
        + buffer.hashCode()
        + bufferPos
        + currentIndex
        + nextIndex;
}
// Two Normalizers are equal if they are the same object, or if mode,
// options, text iterator, buffer contents, buffer position and nextIndex
// all match. currentIndex is not part of the comparison.
bool Normalizer::operator==(const Normalizer& that) const
{
    if(this==&that) {
        return true;
    }
    if(fUMode!=that.fUMode || fOptions!=that.fOptions) {
        return false;
    }
    if(!(*text==*that.text)) {
        return false;
    }
    if(!(buffer==that.buffer)) {
        return false;
    }
    return bufferPos==that.bufferPos && nextIndex==that.nextIndex;
}
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
// Normalizes source into result using the given mode.
// - A bogus source or an incoming failure status makes result bogus; a bogus
//   source with a success status additionally sets U_ILLEGAL_ARGUMENT_ERROR.
// - With UNORM_UNICODE_3_2 in options, normalization goes through a
//   FilteredNormalizer2 restricted to the Unicode 3.2 set.
// - source and result may be the same object; a temporary is used then and
//   copied to result only on success.
void U_EXPORT2
Normalizer::normalize(const UnicodeString& source,
                      UNormalizationMode mode, int32_t options,
                      UnicodeString& result,
                      UErrorCode &status) {
    if(source.isBogus() || U_FAILURE(status)) {
        result.setToBogus();
        if(U_SUCCESS(status)) {
            status=U_ILLEGAL_ARGUMENT_ERROR;
        }
        return;
    }

    UnicodeString localDest;
    // Writing straight into result would clobber the input when aliased.
    const bool aliased=(&source==&result);
    UnicodeString &dest=aliased ? localDest : result;

    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_SUCCESS(status)) {
        if((options&UNORM_UNICODE_3_2)==0) {
            n2->normalize(source, dest, status);
        } else {
            FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
            filtered.normalize(source, dest, status);
        }
    }
    if(aliased && U_SUCCESS(status)) {
        result=dest;
    }
}
// Composes source into result: NFKC when compat is true, NFC otherwise.
// All argument and error handling is delegated to normalize().
void U_EXPORT2
Normalizer::compose(const UnicodeString& source,
                    UBool compat, int32_t options,
                    UnicodeString& result,
                    UErrorCode &status) {
    UNormalizationMode mode=UNORM_NFC;
    if(compat) {
        mode=UNORM_NFKC;
    }
    normalize(source, mode, options, result, status);
}
// Decomposes source into result: NFKD when compat is true, NFD otherwise.
// All argument and error handling is delegated to normalize().
void U_EXPORT2
Normalizer::decompose(const UnicodeString& source,
                      UBool compat, int32_t options,
                      UnicodeString& result,
                      UErrorCode &status) {
    UNormalizationMode mode=UNORM_NFD;
    if(compat) {
        mode=UNORM_NFKD;
    }
    normalize(source, mode, options, result, status);
}
// Runs the quick check of the normalizer selected by mode on source.
// Returns UNORM_MAYBE if the normalizer instance could not be obtained.
// With UNORM_UNICODE_3_2 in options, the check goes through a
// FilteredNormalizer2 restricted to the Unicode 3.2 set.
UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       UNormalizationMode mode, int32_t options,
                       UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return UNORM_MAYBE;
    }
    if((options&UNORM_UNICODE_3_2)==0) {
        return n2->quickCheck(source, status);
    }
    FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
    return filtered.quickCheck(source, status);
}
// Tests whether source is normalized according to mode.
// Returns false if the normalizer instance could not be obtained.
// With UNORM_UNICODE_3_2 in options, the test goes through a
// FilteredNormalizer2 restricted to the Unicode 3.2 set.
UBool
Normalizer::isNormalized(const UnicodeString& source,
                         UNormalizationMode mode, int32_t options,
                         UErrorCode &status) {
    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return false;
    }
    if((options&UNORM_UNICODE_3_2)==0) {
        return n2->isNormalized(source, status);
    }
    FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
    return filtered.isNormalized(source, status);
}
// Stores left followed by right in result, using the mode's normalizer to
// append right so that it can handle the seam between the two strings.
// - A bogus input or an incoming failure code makes result bogus; bogus
//   input with a success code additionally sets U_ILLEGAL_ARGUMENT_ERROR.
// - right and result may be the same object; a temporary is used then and
//   copied to result only on success.
// Always returns result.
UnicodeString & U_EXPORT2
Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
                        UnicodeString &result,
                        UNormalizationMode mode, int32_t options,
                        UErrorCode &errorCode) {
    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
        result.setToBogus();
        if(U_SUCCESS(errorCode)) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        }
        return result;
    }

    UnicodeString localDest;
    // Copying left into result would destroy right when they alias.
    const bool aliased=(&right==&result);
    UnicodeString &dest=aliased ? localDest : result;
    dest=left;

    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
    if(U_SUCCESS(errorCode)) {
        if((options&UNORM_UNICODE_3_2)==0) {
            n2->append(dest, right, errorCode);
        } else {
            FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(errorCode));
            filtered.append(dest, right, errorCode);
        }
    }
    if(aliased && U_SUCCESS(errorCode)) {
        result=dest;
    }
    return result;
}
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
/**
 * Returns the code point at the current position of the normalized text
 * without advancing. If the buffer is exhausted, the next input segment is
 * normalized first; DONE is returned when that yields nothing.
 */
UChar32 Normalizer::current() {
    if(bufferPos>=buffer.length() && !nextNormalize()) {
        return DONE;
    }
    return buffer.char32At(bufferPos);
}
/**
 * Returns the code point at the current position of the normalized text and
 * then advances past it. If the buffer is exhausted, the next input segment
 * is normalized first. If the end of the text has already been reached,
 * {@link #DONE} is returned.
 */
UChar32 Normalizer::next() {
    if(bufferPos>=buffer.length() && !nextNormalize()) {
        return DONE;
    }
    const UChar32 cp=buffer.char32At(bufferPos);
    // Step over one or two UTF-16 code units, depending on the code point.
    bufferPos+=U16_LENGTH(cp);
    return cp;
}
/**
 * Moves back by one code point in the normalized text and returns it.
 * If the position is at the start of the buffer, the preceding input segment
 * is normalized first. If the beginning of the text has already been
 * reached, {@link #DONE} is returned.
 */
UChar32 Normalizer::previous() {
    if(bufferPos<=0 && !previousNormalize()) {
        return DONE;
    }
    const UChar32 cp=buffer.char32At(bufferPos-1);
    // Step back over one or two UTF-16 code units, depending on the code point.
    bufferPos-=U16_LENGTH(cp);
    return cp;
}
void Normalizer::reset() {
currentIndex=nextIndex=text->setToStart();
clearBuffer();
}
// Repositions the input iterator at index (as pinned by the iterator itself),
// points both segment indexes at the resulting position and discards any
// buffered normalized text. Nothing is normalized here.
void
Normalizer::setIndexOnly(int32_t index) {
    text->setIndex(index);  // pins index
    const int32_t pinned=text->getIndex();
    nextIndex=pinned;
    currentIndex=pinned;
    clearBuffer();
}
/**
 * Resets the <tt>Normalizer</tt> to the beginning of the text and returns
 * the first character of the normalized text (the result of next()).
 */
UChar32 Normalizer::first() {
    reset();
    const UChar32 cp=next();
    return cp;
}
/**
 * Return the last character in the normalized text. This resets
 * the <tt>Normalizer's</tt> position to be just before
 * the input text corresponding to that normalized character.
 */
UChar32 Normalizer::last() {
    // Park both segment indexes at the end of the input, drop the buffer,
    // then step backwards once.
    const int32_t end=text->setToEnd();
    nextIndex=end;
    currentIndex=end;
    clearBuffer();
    return previous();
}
/**
 * Retrieve the current iteration position in the input text that is
 * being normalized. This method is useful in applications such as
 * searching, where you need to be able to determine the position in
 * the input text that corresponds to a given normalized output character.
 * <p>
 * <b>Note:</b> The returned index refers to the <em>input</em>, while
 * {@link #next} and {@link #previous} iterate through characters in the
 * <em>output</em>. This means that there is not necessarily a one-to-one
 * correspondence between characters returned by <tt>next</tt> and
 * <tt>previous</tt> and the indices passed to and returned from
 * <tt>setIndex</tt> and {@link #getIndex}.
 * <p>
 * While unread characters remain in the buffer, the start of the current
 * input segment is reported; otherwise the start of the next segment.
 */
int32_t Normalizer::getIndex() const {
    return bufferPos<buffer.length() ? currentIndex : nextIndex;
}
/**
 * Retrieve the index of the start of the input text, as reported by the
 * underlying text iterator. This is the begin index of the
 * <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating.
 */
int32_t Normalizer::startIndex() const {
    const int32_t begin=text->startIndex();
    return begin;
}
/**
 * Retrieve the index of the end of the input text, as reported by the
 * underlying text iterator. This is the end index of the
 * <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
 * over which this <tt>Normalizer</tt> is iterating.
 */
int32_t Normalizer::endIndex() const {
    const int32_t end=text->endIndex();
    return end;
}
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
// Switches the normalization mode and re-runs init() so that fNorm2
// matches the new mode. The iteration position and buffer are left alone.
void
Normalizer::setMode(UNormalizationMode newMode)
{
    fUMode=newMode;
    init();
}
// Returns the normalization mode this iterator currently uses.
UNormalizationMode
Normalizer::getUMode() const
{
    const UNormalizationMode mode=fUMode;
    return mode;
}
// Sets (value true) or clears (value false) the bits of option in fOptions,
// then re-runs init() so that fNorm2 reflects the new option set.
void
Normalizer::setOption(int32_t option,
                      UBool value)
{
    fOptions=value ? (fOptions | option) : (fOptions & (~option));
    init();
}
// Returns true if at least one bit of option is currently set in fOptions.
UBool
Normalizer::getOption(int32_t option) const
{
    const int32_t selected=fOptions & option;
    return selected != 0;
}
/**
* Set the input text over which this <tt>Normalizer</tt> will iterate.
* The iteration position is set to the beginning of the input text.
*/
void
Normalizer::setText(const UnicodeString& newText,
UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
CharacterIterator *newIter = new StringCharacterIterator(newText);
if (newIter == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete text;
text = newIter;
reset();
}
/**
* Set the input text over which this <tt>Normalizer</tt> will iterate.
* The iteration position is set to the beginning of the string.
*/
void
Normalizer::setText(const CharacterIterator& newText,
UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
CharacterIterator *newIter = newText.clone();
if (newIter == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete text;
text = newIter;
reset();
}
// Sets the input text to the given UTF-16 array (wrapped in a new
// UCharCharacterIterator) and moves the iteration position to its beginning.
// Does nothing if status already indicates a failure; sets
// U_MEMORY_ALLOCATION_ERROR and keeps the old text if the new iterator
// cannot be allocated.
void
Normalizer::setText(ConstChar16Ptr newText,
                    int32_t length,
                    UErrorCode &status)
{
    if (U_SUCCESS(status)) {
        CharacterIterator *replacement = new UCharCharacterIterator(newText, length);
        if (replacement != nullptr) {
            // Swap iterators only once the replacement exists.
            delete text;
            text = replacement;
            reset();
        } else {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
/**
 * Copies the text under iteration into the UnicodeString referred to by
 * "result", by delegating to the underlying text iterator.
 * @param result Receives a copy of the text under iteration.
 */
void
Normalizer::getText(UnicodeString& result)
{
    CharacterIterator &input=*text;
    input.getText(result);
}
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
// Empties the normalized-output buffer and rewinds the read position in it.
void Normalizer::clearBuffer() {
    bufferPos=0;
    buffer.remove();
}
// Normalizes the input segment that starts at nextIndex into buffer.
// The segment consists of one code point plus all following code points up
// to (not including) the next one that has a normalization boundary before
// it. On return currentIndex/nextIndex delimit the consumed input.
// Returns false at the end of the input, if normalization fails, or if the
// normalized segment is empty.
UBool
Normalizer::nextNormalize() {
    clearBuffer();
    currentIndex=nextIndex;
    text->setIndex(nextIndex);
    if(!text->hasNext()) {
        return false;
    }
    // Always take the first code point so that we make progress.
    UnicodeString segment(text->next32PostInc());
    while(text->hasNext()) {
        const UChar32 cp=text->next32PostInc();
        if(fNorm2->hasBoundaryBefore(cp)) {
            // cp starts the following segment: un-read it and stop.
            text->move32(-1, CharacterIterator::kCurrent);
            break;
        }
        segment.append(cp);
    }
    nextIndex=text->getIndex();

    UErrorCode errorCode=U_ZERO_ERROR;
    fNorm2->normalize(segment, buffer, errorCode);
    if(U_FAILURE(errorCode)) {
        return false;
    }
    return !buffer.isEmpty();
}
// Normalizes the input segment that ends at currentIndex into buffer and
// positions bufferPos at the end of the buffer (ready for backward reading).
// The segment is collected backwards until a code point with a
// normalization boundary before it has been included, or the start of the
// input is reached. On return currentIndex/nextIndex delimit the segment.
// Returns false at the start of the input, if normalization fails, or if
// the normalized segment is empty.
UBool
Normalizer::previousNormalize() {
    clearBuffer();
    nextIndex=currentIndex;
    text->setIndex(currentIndex);
    if(!text->hasPrevious()) {
        return false;
    }
    UnicodeString segment;
    UBool reachedBoundary=false;
    while(!reachedBoundary && text->hasPrevious()) {
        const UChar32 cp=text->previous32();
        // Prepend, because the input is being read back to front.
        segment.insert(0, cp);
        reachedBoundary=fNorm2->hasBoundaryBefore(cp);
    }
    currentIndex=text->getIndex();

    UErrorCode errorCode=U_ZERO_ERROR;
    fNorm2->normalize(segment, buffer, errorCode);
    bufferPos=buffer.length();
    if(U_FAILURE(errorCode)) {
        return false;
    }
    return !buffer.isEmpty();
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -0,0 +1,23 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2003-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/parsepos.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ParsePosition)
// Out-of-line destructor with an empty body: nothing is released here.
ParsePosition::~ParsePosition() {}
// Returns a heap-allocated copy of this ParsePosition made with the copy
// constructor. This function keeps no reference to the returned object.
ParsePosition *
ParsePosition::clone() const {
    ParsePosition *duplicate = new ParsePosition(*this);
    return duplicate;
}
U_NAMESPACE_END

View file

@ -0,0 +1,230 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: patternprops.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2011mar13
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "patternprops.h"
U_NAMESPACE_BEGIN
/*
* One byte per Latin-1 character (U+0000..U+00FF), indexed by code point.
* Bit 0 is set if either Pattern property is true,
* bit 1 if Pattern_Syntax is true,
* bit 2 if Pattern_White_Space is true.
* That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5.
*
* Readers in this file: isSyntax() tests bit 1, isWhiteSpace() tests bit 2,
* and isSyntaxOrWhiteSpace() tests bit 0.
* Each row below covers 16 code points; the comment above a row names the
* code points in it that have a property.
*/
static const uint8_t latin1[256]={
// WS: 9..D
0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// WS: 20 Syntax: 21..2F
5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
// Syntax: 3A..40
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Syntax: 5B..5E
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
// Syntax: 60
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Syntax: 7B..7E
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
// WS: 85
0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Syntax: A1..A7, A9, AB, AC, AE
0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
// Syntax: B0, B1, B6, BB, BF
3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Syntax: D7
0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Syntax: F7
0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
* One byte per 32 characters from U+2000..U+303F indexing into
* a small table of 32-bit data words.
* The first two data words are all-zeros and all-ones.
*
* Looked up as index2000[(c-0x2000)>>5]; the resulting value selects a word
* in syntax2000[] or syntaxOrWhiteSpace2000[].
* Each row below covers 8 blocks of 32 code points (0x100 code points);
* the last row holds the two blocks for U+3000..U+303F, for 130 entries total.
*/
static const uint8_t index2000[130]={
2, 3, 4, 0, 0, 0, 0, 0, // 20xx
0, 0, 0, 0, 5, 1, 1, 1, // 21xx
1, 1, 1, 1, 1, 1, 1, 1, // 22xx
1, 1, 1, 1, 1, 1, 1, 1, // 23xx
1, 1, 1, 0, 0, 0, 0, 0, // 24xx
1, 1, 1, 1, 1, 1, 1, 1, // 25xx
1, 1, 1, 1, 1, 1, 1, 1, // 26xx
1, 1, 1, 6, 7, 1, 1, 1, // 27xx
1, 1, 1, 1, 1, 1, 1, 1, // 28xx
1, 1, 1, 1, 1, 1, 1, 1, // 29xx
1, 1, 1, 1, 1, 1, 1, 1, // 2Axx
1, 1, 1, 1, 1, 1, 1, 1, // 2Bxx
0, 0, 0, 0, 0, 0, 0, 0, // 2Cxx
0, 0, 0, 0, 0, 0, 0, 0, // 2Dxx
1, 1, 1, 1, 0, 0, 0, 0, // 2Exx
0, 0, 0, 0, 0, 0, 0, 0, // 2Fxx
8, 9 // 3000..303F
};
/*
* One 32-bit integer per 32 characters. Ranges of all-false and all-true
* are mapped to the first two values, other ranges map to appropriate bit patterns.
*
* Selected via index2000[]; within a word, bit (c&0x1f) is set if c has
* Pattern_Syntax (see isSyntax()). The comment on each word gives its index
* in this array and the code points whose bits are set.
*/
static const uint32_t syntax2000[]={
0,
0xffffffff,
0xffff0000, // 2: 2010..201F
0x7fff00ff, // 3: 2020..2027, 2030..203E
0x7feffffe, // 4: 2041..2053, 2055..205E
0xffff0000, // 5: 2190..219F
0x003fffff, // 6: 2760..2775
0xfff00000, // 7: 2794..279F
0xffffff0e, // 8: 3001..3003, 3008..301F
0x00010001 // 9: 3020, 3030
};
/*
* Same as syntax2000, but with additional bits set for the
* Pattern_White_Space characters 200E 200F 2028 2029.
*
* Only words 2 and 3 differ from syntax2000. Selected via index2000[];
* within a word, bit (c&0x1f) is tested by isSyntaxOrWhiteSpace().
*/
static const uint32_t syntaxOrWhiteSpace2000[]={
0,
0xffffffff,
0xffffc000, // 2: 200E..201F
0x7fff03ff, // 3: 2020..2029, 2030..203E
0x7feffffe, // 4: 2041..2053, 2055..205E
0xffff0000, // 5: 2190..219F
0x003fffff, // 6: 2760..2775
0xfff00000, // 7: 2794..279F
0xffffff0e, // 8: 3001..3003, 3008..301F
0x00010001 // 9: 3020, 3030
};
// Returns true if c has the Pattern_Syntax property.
// Latin-1 is answered from latin1[] (bit 1), U+2010..U+3030 from the
// index2000[]/syntax2000[] bit set, and the four code points
// U+FD3E, U+FD3F, U+FE45, U+FE46 are tested directly.
UBool
PatternProps::isSyntax(UChar32 c) {
    if(c<0) {
        return false;
    }
    if(c<=0xff) {
        return (UBool)((latin1[c]>>1)&1);
    }
    if(c<0x2010) {
        return false;
    }
    if(c<=0x3030) {
        const uint32_t word=syntax2000[index2000[(c-0x2000)>>5]];
        const uint32_t mask=(uint32_t)1<<(c&0x1f);
        return (UBool)((word&mask)!=0);
    }
    // The only remaining Pattern_Syntax code points.
    return c==0xfd3e || c==0xfd3f || c==0xfe45 || c==0xfe46;
}
// Returns true if c has Pattern_Syntax or Pattern_White_Space.
// Latin-1 is answered from latin1[] (bit 0), U+200E..U+3030 from the
// index2000[]/syntaxOrWhiteSpace2000[] bit set, and the four code points
// U+FD3E, U+FD3F, U+FE45, U+FE46 are tested directly.
UBool
PatternProps::isSyntaxOrWhiteSpace(UChar32 c) {
    if(c<0) {
        return false;
    }
    if(c<=0xff) {
        return (UBool)(latin1[c]&1);
    }
    if(c<0x200e) {
        return false;
    }
    if(c<=0x3030) {
        const uint32_t word=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];
        const uint32_t mask=(uint32_t)1<<(c&0x1f);
        return (UBool)((word&mask)!=0);
    }
    // The only remaining code points with either property.
    return c==0xfd3e || c==0xfd3f || c==0xfe45 || c==0xfe46;
}
// Returns true if c has the Pattern_White_Space property.
// Latin-1 is answered from latin1[] (bit 2); beyond Latin-1 only
// U+200E, U+200F, U+2028 and U+2029 qualify.
UBool
PatternProps::isWhiteSpace(UChar32 c) {
    if(c<0) {
        return false;
    }
    if(c<=0xff) {
        return (UBool)((latin1[c]>>2)&1);
    }
    switch(c) {
    case 0x200e:
    case 0x200f:
    case 0x2028:
    case 0x2029:
        return true;
    default:
        return false;
    }
}
// Advances s past leading Pattern_White_Space, looking at no more than
// length code units. Returns the pointer to the first unit that is not
// white space, or to s+length if all of them are (s itself if length<=0).
const char16_t *
PatternProps::skipWhiteSpace(const char16_t *s, int32_t length) {
    for(; length>0 && isWhiteSpace(*s); --length) {
        ++s;
    }
    return s;
}
// Returns the smallest index at or after start whose code unit in s is not
// Pattern_White_Space; stops at s.length(). If start is already at or past
// the length, start is returned unchanged.
int32_t
PatternProps::skipWhiteSpace(const UnicodeString& s, int32_t start) {
    const int32_t end = s.length();
    int32_t pos = start;
    for(; pos<end && isWhiteSpace(s.charAt(pos)); ++pos) {
    }
    return pos;
}
const char16_t *
PatternProps::trimWhiteSpace(const char16_t *s, int32_t &length) {
if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) {
return s;
}
int32_t start=0;
int32_t limit=length;
while(start<limit && isWhiteSpace(s[start])) {
++start;
}
if(start<limit) {
// There is non-white space at start; we will not move limit below that,
// so we need not test start<limit in the loop.
while(isWhiteSpace(s[limit-1])) {
--limit;
}
}
length=limit-start;
return s+start;
}
// Returns true if s[0..length) is non-empty and contains no code unit with
// Pattern_Syntax or Pattern_White_Space. Returns false for length<=0.
UBool
PatternProps::isIdentifier(const char16_t *s, int32_t length) {
    if(length<=0) {
        return false;
    }
    const char16_t *limit=s+length;
    for(const char16_t *p=s; p<limit; ++p) {
        if(isSyntaxOrWhiteSpace(*p)) {
            return false;
        }
    }
    return true;
}
// Advances s past code units that are neither Pattern_Syntax nor
// Pattern_White_Space, looking at no more than length code units.
// Returns the pointer to the first syntax/white space unit, or to s+length
// if there is none (s itself if length<=0).
const char16_t *
PatternProps::skipIdentifier(const char16_t *s, int32_t length) {
    for(; length>0 && !isSyntaxOrWhiteSpace(*s); --length) {
        ++s;
    }
    return s;
}
U_NAMESPACE_END

View file

@ -0,0 +1,98 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: patternprops.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2011mar13
* created by: Markus W. Scherer
*/
#ifndef __PATTERNPROPS_H__
#define __PATTERNPROPS_H__
#include "unicode/unistr.h"
#include "unicode/utypes.h"
U_NAMESPACE_BEGIN
/**
* Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space.
* Hardcodes these properties, does not load data, does not depend on other ICU classes.
* <p>
* Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points,
* and both properties only include BMP code points (no supplementary ones).
* Pattern_Syntax includes some unassigned code points.
* <p>
* [:Pattern_White_Space:] =
* [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029]
* <p>
* [:Pattern_Syntax:] =
* [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE
* \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7
* \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E
* \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F
* \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46]
* @author mscherer
*/
class U_COMMON_API PatternProps {
public:
/**
* @param c the code point to test; negative values yield false.
* @return true if c is a Pattern_Syntax code point.
*/
static UBool isSyntax(UChar32 c);
/**
* @param c the code point to test; negative values yield false.
* @return true if c is a Pattern_Syntax or Pattern_White_Space code point.
*/
static UBool isSyntaxOrWhiteSpace(UChar32 c);
/**
* @param c the code point to test; negative values yield false.
* @return true if c is a Pattern_White_Space character.
*/
static UBool isWhiteSpace(UChar32 c);
/**
* Skips over Pattern_White_Space starting at s.
* @param s pointer to the first code unit to examine
* @param length maximum number of code units to examine
* @return The smallest pointer at or after s with a non-white space character,
* or s+length if only white space was found.
*/
static const char16_t *skipWhiteSpace(const char16_t *s, int32_t length);
/**
* Skips over Pattern_White_Space starting at index start in s.
* @param s the string to examine
* @param start index of the first code unit to examine
* @return The smallest index at or after start with a non-white space character,
* or the string length if only white space was found.
*/
static int32_t skipWhiteSpace(const UnicodeString &s, int32_t start);
/**
* @param s pointer to the text; the text itself is not modified
* @param length in: number of code units at s; out: number of code units
* that remain after trimming
* @return s except with leading and trailing Pattern_White_Space removed and length adjusted.
*/
static const char16_t *trimWhiteSpace(const char16_t *s, int32_t &length);
/**
* Tests whether the string contains a "pattern identifier", that is,
* whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters.
* An empty string (length<=0) is not an identifier.
* @return true if s is non-empty and there are no Pattern_White_Space or
* Pattern_Syntax characters in s.
*/
static UBool isIdentifier(const char16_t *s, int32_t length);
/**
* Skips over a "pattern identifier" starting at pointer s.
* @param s pointer to the first code unit to examine
* @param length maximum number of code units to examine
* @return The smallest pointer at or after s with
* a Pattern_White_Space or Pattern_Syntax character,
* or s+length if there is none.
*/
static const char16_t *skipIdentifier(const char16_t *s, int32_t length);
private:
PatternProps() = delete; // no constructor: all static methods
};
U_NAMESPACE_END
#endif // __PATTERNPROPS_H__

View file

@ -0,0 +1,44 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
* Copyright (C) 2015, International Business Machines Corporation and
* others. All Rights Reserved.
*/
#include "unicode/unistr.h"
#include "charstr.h"
#include "cstring.h"
#include "pluralmap.h"
U_NAMESPACE_BEGIN
// Plural category names, indexed by PluralMapBase::Category value:
// toCategory() casts an index in this array to a Category, and
// getCategoryName() indexes this array with a Category. The order must
// therefore stay in sync with the Category enum (OTHER first).
static const char * const gPluralForms[] = {
"other", "zero", "one", "two", "few", "many"};
// Maps a category name ("other", "zero", ...) to its Category value by an
// exact string comparison against gPluralForms. Returns NONE if the name is
// not in the table.
PluralMapBase::Category
PluralMapBase::toCategory(const char *pluralForm) {
    int32_t index = 0;
    for (const char *name : gPluralForms) {
        if (uprv_strcmp(pluralForm, name) == 0) {
            return static_cast<Category>(index);
        }
        ++index;
    }
    return NONE;
}
// UnicodeString overload: converts the name to invariant chars and
// delegates to toCategory(const char *). Returns NONE if the conversion
// reports a failure.
PluralMapBase::Category
PluralMapBase::toCategory(const UnicodeString &pluralForm) {
    UErrorCode status = U_ZERO_ERROR;
    CharString cCategory;
    cCategory.appendInvariantChars(pluralForm, status);
    if (U_FAILURE(status)) {
        return NONE;
    }
    return toCategory(cCategory.data());
}
// Returns the name for category c from gPluralForms, or nullptr if c is
// outside the table (e.g. NONE or CATEGORY_COUNT).
const char *PluralMapBase::getCategoryName(Category c) {
    const int32_t index = c;
    if (0 <= index && index < UPRV_LENGTHOF(gPluralForms)) {
        return gPluralForms[index];
    }
    return nullptr;
}
U_NAMESPACE_END

View file

@ -0,0 +1,292 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 2015, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*
* File pluralmap.h - PluralMap class that maps plural categories to values.
******************************************************************************
*/
#ifndef __PLURAL_MAP_H__
#define __PLURAL_MAP_H__
#include "unicode/uobject.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
class UnicodeString;
/**
* Non-template base of PluralMap: declares the plural Category enum and the
* static conversions between categories and their names.
*/
class U_COMMON_API PluralMapBase : public UMemory {
public:
/**
* The names of all the plural categories. NONE is not an actual plural
* category, but rather represents the absence of a plural category.
* The real categories are numbered consecutively from OTHER (0);
* CATEGORY_COUNT follows the last one and so equals their number.
*/
enum Category {
NONE = -1,
OTHER,
ZERO,
ONE,
TWO,
FEW,
MANY,
CATEGORY_COUNT
};
/**
* Converts a category name such as "zero", "one", "two", "few", "many"
* or "other" to a category enum. Returns NONE for an unrecognized
* category name.
*/
static Category toCategory(const char *categoryName);
/**
* Converts a category name such as "zero", "one", "two", "few", "many"
* or "other" to a category enum. Returns NONE for an unrecognized
* category name.
*/
static Category toCategory(const UnicodeString &categoryName);
/**
* Converts a category to a name.
* Passing NONE or CATEGORY_COUNT for category returns nullptr.
*/
static const char *getCategoryName(Category category);
};
/**
* A Map of plural categories to values. It maintains ownership of the
* values.
*
* Type T is the value type. T must provide the following:
* 1) Default constructor
* 2) Copy constructor
* 3) Assignment operator
* 4) Must extend UMemory
*/
template<typename T>
class PluralMap : public PluralMapBase {
public:
/**
* Other category is mapped to a copy of the default value.
*/
PluralMap() : fOtherVariant() {
initializeNew();
}
/**
* Other category is mapped to otherVariant.
*/
PluralMap(const T &otherVariant) : fOtherVariant(otherVariant) {
initializeNew();
}
PluralMap(const PluralMap<T> &other) : fOtherVariant(other.fOtherVariant) {
fVariants[0] = &fOtherVariant;
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
fVariants[i] = other.fVariants[i] ?
new T(*other.fVariants[i]) : nullptr;
}
}
PluralMap<T> &operator=(const PluralMap<T> &other) {
if (this == &other) {
return *this;
}
for (int32_t i = 0; i < UPRV_LENGTHOF(fVariants); ++i) {
if (fVariants[i] != nullptr && other.fVariants[i] != nullptr) {
*fVariants[i] = *other.fVariants[i];
} else if (fVariants[i] != nullptr) {
delete fVariants[i];
fVariants[i] = nullptr;
} else if (other.fVariants[i] != nullptr) {
fVariants[i] = new T(*other.fVariants[i]);
} else {
// do nothing
}
}
return *this;
}
~PluralMap() {
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
delete fVariants[i];
}
}
/**
* Removes all mappings and makes 'other' point to the default value.
*/
void clear() {
*fVariants[0] = T();
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
delete fVariants[i];
fVariants[i] = nullptr;
}
}
/**
* Iterates through the mappings in this instance, set index to NONE
* prior to using. Call next repeatedly to get the values until it
* returns nullptr. Each time next returns, caller may pass index
* to getCategoryName() to get the name of the plural category.
* When this function returns nullptr, index is CATEGORY_COUNT
*/
const T *next(Category &index) const {
int32_t idx = index;
++idx;
for (; idx < UPRV_LENGTHOF(fVariants); ++idx) {
if (fVariants[idx] != nullptr) {
index = static_cast<Category>(idx);
return fVariants[idx];
}
}
index = static_cast<Category>(idx);
return nullptr;
}
/**
* non const version of next.
*/
T *nextMutable(Category &index) {
const T *result = next(index);
return const_cast<T *>(result);
}
/**
* Returns the 'other' variant.
* Same as calling get(OTHER).
*/
const T &getOther() const {
return get(OTHER);
}
/**
* Returns the value associated with a category.
* If no value found, or v is NONE or CATEGORY_COUNT, falls
* back to returning the value for the 'other' category.
*/
const T &get(Category v) const {
    const int32_t slot = v;
    const bool inRange = 0 <= slot && slot < UPRV_LENGTHOF(fVariants);
    // An out-of-range category is treated like a category without a stored value.
    const T *value = inRange ? fVariants[slot] : nullptr;
    return value != nullptr ? *value : *fVariants[0];
}
/**
* Convenience routine to get the value by category name. Otherwise
* works just like get(Category).
*/
const T &get(const char *category) const {
    // Translate the name first, then defer to the category-based lookup.
    const auto parsed = toCategory(category);
    return get(parsed);
}
/**
* Convenience routine to get the value by category name as a
* UnicodeString. Otherwise works just like get(category).
*/
const T &get(const UnicodeString &category) const {
    // Translate the name first, then defer to the category-based lookup.
    const auto parsed = toCategory(category);
    return get(parsed);
}
/**
* Returns a pointer to the value associated with a category
* that caller can safely modify. If the value was defaulting to the 'other'
* variant because no explicit value was stored, this method creates a
* new value using the default constructor at the returned pointer.
*
* @param category the category with the value to change.
* @param status error returned here if index is NONE or CATEGORY_COUNT
* or memory could not be allocated, or any other error happens.
*/
T *getMutable(
        Category category,
        UErrorCode &status) {
    // No default supplied: a missing value is created with T's default constructor.
    const T *noDefault = nullptr;
    return getMutable(category, noDefault, status);
}
/**
* Convenience routine to get a mutable pointer to a value by category name.
* Otherwise works just like getMutable(Category, UErrorCode &).
* reports an error if the category name is invalid.
*/
T *getMutable(
        const char *category,
        UErrorCode &status) {
    // Resolve the name to a category; the shared helper validates the result.
    const auto parsed = toCategory(category);
    return getMutable(parsed, nullptr, status);
}
/**
* Just like getMutable(Category, UErrorCode &) but copies defaultValue to
* returned pointer if it was defaulting to the 'other' variant
* because no explicit value was stored.
*/
T *getMutableWithDefault(
        Category category,
        const T &defaultValue,
        UErrorCode &status) {
    // Hand the default over by address; it is copied only if the slot is empty.
    const T *fallback = &defaultValue;
    return getMutable(category, fallback, status);
}
/**
* Returns true if this object equals rhs.
*/
UBool equals(
        const PluralMap<T> &rhs,
        UBool (*eqFunc)(const T &, const T &)) const {
    for (int32_t slot = 0; slot < UPRV_LENGTHOF(fVariants); ++slot) {
        const T *lhsValue = fVariants[slot];
        const T *rhsValue = rhs.fVariants[slot];
        // Identical pointers (including both null) need no further comparison.
        if (lhsValue != rhsValue) {
            // A value on one side only, or two values that compare unequal,
            // makes the maps differ.
            if (lhsValue == nullptr || rhsValue == nullptr ||
                    !eqFunc(*lhsValue, *rhsValue)) {
                return false;
            }
        }
    }
    return true;
}
private:
// Storage for the 'other' variant; fVariants[0] is set to point at this member.
T fOtherVariant;
// One slot per plural category. Slot 0 refers to fOtherVariant; the remaining
// slots are either nullptr (no explicit value) or own a heap-allocated T.
T* fVariants[6];
// Shared implementation of the public getMutable*() methods.
// Returns the modifiable value for 'category', creating it on demand:
// copy-constructed from *defaultValue when given, default-constructed otherwise.
// Sets status to U_ILLEGAL_ARGUMENT_ERROR for an out-of-range category and to
// U_MEMORY_ALLOCATION_ERROR when the slot is still empty after creation.
T *getMutable(
        Category category,
        const T *defaultValue,
        UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    const int32_t slot = category;
    if (!(0 <= slot && slot < UPRV_LENGTHOF(fVariants))) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    T *&entry = fVariants[slot];
    if (entry == nullptr) {
        if (defaultValue != nullptr) {
            entry = new T(*defaultValue);
        } else {
            entry = new T();
        }
    }
    if (entry == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return entry;
}
void initializeNew() {
fVariants[0] = &fOtherVariant;
for (int32_t i = 1; i < UPRV_LENGTHOF(fVariants); ++i) {
fVariants[i] = nullptr;
}
}
};
U_NAMESPACE_END
#endif

View file

@ -0,0 +1,334 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2002-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: October 30 2002
* Since: ICU 2.4
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
**********************************************************************
*/
#include "propname.h"
#include "unicode/uchar.h"
#include "unicode/udata.h"
#include "unicode/uscript.h"
#include "umutex.h"
#include "cmemory.h"
#include "cstring.h"
#include "uarrsort.h"
#include "uinvchar.h"
#define INCLUDED_FROM_PROPNAME_CPP
#include "propname_data.h"
U_CDECL_BEGIN
/**
* Get the next non-ignorable ASCII character from a property name
* and lowercases it.
* @return ((advance count for the name)<<8)|character
*/
static inline int32_t
getASCIIPropertyNameChar(const char *name) {
    int32_t consumed = 0;
    char ch;
    /* Step over '-', '_', space and the ASCII controls 0x09..0x0d. */
    for(;;) {
        ch = name[consumed++];
        if(!(ch == 0x2d || ch == 0x5f || ch == 0x20 || (0x09 <= ch && ch <= 0x0d))) {
            break;
        }
    }
    /* The advance count goes into the upper bits, the lowercased character into the low byte. */
    int32_t packed = consumed << 8;
    if(ch != 0) {
        packed |= (uint8_t)uprv_asciitolower((char)ch);
    }
    return packed;
}
/**
* Get the next non-ignorable EBCDIC character from a property name
* and lowercases it.
* @return ((advance count for the name)<<8)|character
*/
static inline int32_t
getEBCDICPropertyNameChar(const char *name) {
    int32_t consumed = 0;
    char ch;
    for(;;) {
        ch = name[consumed++];
        switch(ch) {
        /* EBCDIC codes for '-', '_' and the ignorable White_Space characters */
        case 0x60: case 0x6d:
        case 0x40: case 0x05: case 0x15: case 0x25:
        case 0x0b: case 0x0c: case 0x0d:
            continue; /* ignorable: fetch the next character */
        default:
            break;
        }
        break; /* found a significant character or the terminating NUL */
    }
    /* The advance count goes into the upper bits, the lowercased character into the low byte. */
    int32_t packed = consumed << 8;
    if(ch != 0) {
        packed |= (uint8_t)uprv_ebcdictolower((char)ch);
    }
    return packed;
}
/**
* Unicode property names and property value names are compared "loosely".
*
* UCD.html 4.0.1 says:
* For all property names, property value names, and for property values for
* Enumerated, Binary, or Catalog properties, use the following
* loose matching rule:
*
* LM3. Ignore case, whitespace, underscore ('_'), and hyphens.
*
* This function does just that, for (char *) name strings.
* It is almost identical to ucnv_compareNames() but also ignores
* C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
*
* @internal
*/
U_CAPI int32_t U_EXPORT2
uprv_compareASCIIPropertyNames(const char *name1, const char *name2) {
    for(;;) {
        /* Each result packs (advance count << 8) | lowercased character. */
        const int32_t packed1 = getASCIIPropertyNameChar(name1);
        const int32_t packed2 = getASCIIPropertyNameChar(name2);
        const int32_t ch1 = packed1 & 0xff;
        const int32_t ch2 = packed2 & 0xff;
        /* Both names are exhausted at the same time: they match. */
        if(ch1 == 0 && ch2 == 0) {
            return 0;
        }
        /* The first differing significant character decides the order. */
        if(ch1 != ch2) {
            return ch1 - ch2;
        }
        /* Same character: skip it, together with the ignorables before it. */
        name1 += packed1 >> 8;
        name2 += packed2 >> 8;
    }
}
/* EBCDIC variant of the loose property-name comparison. */
U_CAPI int32_t U_EXPORT2
uprv_compareEBCDICPropertyNames(const char *name1, const char *name2) {
    for(;;) {
        /* Each result packs (advance count << 8) | lowercased character. */
        const int32_t packed1 = getEBCDICPropertyNameChar(name1);
        const int32_t packed2 = getEBCDICPropertyNameChar(name2);
        const int32_t ch1 = packed1 & 0xff;
        const int32_t ch2 = packed2 & 0xff;
        /* Both names are exhausted at the same time: they match. */
        if(ch1 == 0 && ch2 == 0) {
            return 0;
        }
        /* The first differing significant character decides the order. */
        if(ch1 != ch2) {
            return ch1 - ch2;
        }
        /* Same character: skip it, together with the ignorables before it. */
        name1 += packed1 >> 8;
        name2 += packed2 >> 8;
    }
}
U_CDECL_END
U_NAMESPACE_BEGIN
// Returns the valueMaps[] index of the entry pair for the given property,
// or 0 if the property is not covered by any range.
int32_t PropNameData::findProperty(int32_t property) {
    int32_t cursor = 1;  // valueMaps[0] holds the number of ranges
    int32_t remaining = valueMaps[0];
    while(remaining > 0) {
        const int32_t start = valueMaps[cursor];
        const int32_t limit = valueMaps[cursor + 1];
        const int32_t firstPair = cursor + 2;  // first entry pair of this range
        if(property < start) {
            return 0;  // stop searching: this range starts above the property
        }
        if(property < limit) {
            return firstPair + (property - start) * 2;
        }
        // Jump over the two-word entries of this range.
        cursor = firstPair + (limit - start) * 2;
        --remaining;
    }
    return 0;
}
// Looks up the nameGroups[] offset for one value of a property.
// valueMapIndex is the index of the property's value map in valueMaps[];
// returns 0 if there is no value map or no entry for the value.
int32_t PropNameData::findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value) {
    if(valueMapIndex == 0) {
        return 0;  // The property does not have named values.
    }
    int32_t cursor = valueMapIndex + 1;  // Step over the BytesTrie offset.
    const int32_t numRanges = valueMaps[cursor++];
    if(numRanges >= 0x10) {
        // List form: (numRanges-0x10) values followed by as many name group offsets.
        const int32_t numValues = numRanges - 0x10;
        const int32_t valuesStart = cursor;
        const int32_t offsetsStart = valuesStart + numValues;
        int32_t k = 0;
        do {
            const int32_t candidate = valueMaps[valuesStart + k];
            if(value < candidate) {
                break;
            }
            if(value == candidate) {
                return valueMaps[offsetsStart + k];
            }
        } while(++k < numValues);
        return 0;
    }
    // Range form: each range is start, limit, then (limit-start) offsets.
    for(int32_t remaining = numRanges; remaining > 0; --remaining) {
        const int32_t start = valueMaps[cursor];
        const int32_t limit = valueMaps[cursor + 1];
        cursor += 2;
        if(value < start) {
            break;
        }
        if(value < limit) {
            return valueMaps[cursor + value - start];
        }
        cursor += limit - start;  // Step over this range's entries.
    }
    return 0;
}
// Returns the nameIndex-th name of a name group, or nullptr if the index is
// out of range or the selected name is empty.
const char *PropNameData::getName(const char *nameGroup, int32_t nameIndex) {
    // The group starts with its name count.
    const int32_t numNames = *nameGroup;
    if(nameIndex < 0 || nameIndex >= numNames) {
        return nullptr;
    }
    const char *name = nameGroup + 1;
    // Walk past nameIndex NUL-terminated strings.
    for(int32_t skipped = 0; skipped < nameIndex; ++skipped) {
        name = uprv_strchr(name, 0) + 1;
    }
    // An empty string means no name (Property[Value]Aliases.txt has "n/a").
    return *name != 0 ? name : nullptr;
}
// Feeds the loosely-normalized name into the trie and reports whether the
// trie ends on a value. A null name never matches.
UBool PropNameData::containsName(BytesTrie &trie, const char *name) {
    if(name == nullptr) {
        return false;
    }
    UStringTrieResult result = USTRINGTRIE_NO_VALUE;
    for(const char *p = name; *p != 0; ++p) {
        const char c = uprv_invCharToLowercaseAscii(*p);
        // Delimiters '-', '_', and ASCII White_Space do not take part in matching.
        const bool isDelimiter =
            c == 0x2d || c == 0x5f || c == 0x20 || (0x09 <= c && c <= 0x0d);
        if(!isDelimiter) {
            if(!USTRINGTRIE_HAS_NEXT(result)) {
                return false;
            }
            result = trie.next((uint8_t)c);
        }
    }
    return USTRINGTRIE_HAS_VALUE(result);
}
const char *PropNameData::getPropertyName(int32_t property, int32_t nameChoice) {
int32_t valueMapIndex=findProperty(property);
if(valueMapIndex==0) {
return nullptr; // Not a known property.
}
return getName(nameGroups+valueMaps[valueMapIndex], nameChoice);
}
const char *PropNameData::getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice) {
int32_t valueMapIndex=findProperty(property);
if(valueMapIndex==0) {
return nullptr; // Not a known property.
}
int32_t nameGroupOffset=findPropertyValueNameGroup(valueMaps[valueMapIndex+1], value);
if(nameGroupOffset==0) {
return nullptr;
}
return getName(nameGroups+nameGroupOffset, nameChoice);
}
// Maps an alias to its enum value using the trie at bytesTrieOffset;
// returns UCHAR_INVALID_CODE if the alias is not found.
int32_t PropNameData::getPropertyOrValueEnum(int32_t bytesTrieOffset, const char *alias) {
    BytesTrie trie(bytesTries + bytesTrieOffset);
    if(!containsName(trie, alias)) {
        return UCHAR_INVALID_CODE;
    }
    return trie.getValue();
}
// Maps a property alias to its enum value via the trie at offset 0.
int32_t PropNameData::getPropertyEnum(const char *alias) {
    const int32_t propertyTrieOffset = 0;
    return getPropertyOrValueEnum(propertyTrieOffset, alias);
}
int32_t PropNameData::getPropertyValueEnum(int32_t property, const char *alias) {
int32_t valueMapIndex=findProperty(property);
if(valueMapIndex==0) {
return UCHAR_INVALID_CODE; // Not a known property.
}
valueMapIndex=valueMaps[valueMapIndex+1];
if(valueMapIndex==0) {
return UCHAR_INVALID_CODE; // The property does not have named values.
}
// valueMapIndex is the start of the property's valueMap,
// where the first word is the BytesTrie offset.
return getPropertyOrValueEnum(valueMaps[valueMapIndex], alias);
}
U_NAMESPACE_END
//----------------------------------------------------------------------
// Public API implementation
U_CAPI const char* U_EXPORT2
u_getPropertyName(UProperty property,
                  UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
    // nameChoice is effectively a plain integer: besides the named short and
    // long constants, further aliases (if any) are selected with
    // U_LONG_PROPERTY_NAME + i, where i=1, 2,...
    U_NAMESPACE_USE
    const char *name = PropNameData::getPropertyName(property, nameChoice);
    return name;
}
U_CAPI UProperty U_EXPORT2
u_getPropertyEnum(const char* alias) {
    U_NAMESPACE_USE
    // The lookup yields an int32_t; convert it to the public enum type.
    const int32_t code = PropNameData::getPropertyEnum(alias);
    return (UProperty)code;
}
U_CAPI const char* U_EXPORT2
u_getPropertyValueName(UProperty property,
                       int32_t value,
                       UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
    // nameChoice is effectively a plain integer: besides the named short and
    // long constants, further aliases (if any) are selected with
    // U_LONG_PROPERTY_NAME + i, where i=1, 2,...
    U_NAMESPACE_USE
    const char *name = PropNameData::getPropertyValueName(property, value, nameChoice);
    return name;
}
U_CAPI int32_t U_EXPORT2
u_getPropertyValueEnum(UProperty property,
                       const char* alias) {
    U_NAMESPACE_USE
    const int32_t valueCode = PropNameData::getPropertyValueEnum(property, alias);
    return valueCode;
}
U_CAPI const char* U_EXPORT2
uscript_getName(UScriptCode scriptCode){
    /* A script's name is the long value name of the Script property. */
    const char *longName =
        u_getPropertyValueName(UCHAR_SCRIPT, scriptCode, U_LONG_PROPERTY_NAME);
    return longName;
}
U_CAPI const char* U_EXPORT2
uscript_getShortName(UScriptCode scriptCode){
    /* A script's short name is the short value name of the Script property. */
    const char *shortName =
        u_getPropertyValueName(UCHAR_SCRIPT, scriptCode, U_SHORT_PROPERTY_NAME);
    return shortName;
}

View file

@ -0,0 +1,212 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2002-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: October 30 2002
* Since: ICU 2.4
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
**********************************************************************
*/
#ifndef PROPNAME_H
#define PROPNAME_H
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/uchar.h"
#include "udataswp.h"
#include "uprops.h"
/*
* This header defines the in-memory layout of the property names data
* structure representing the UCD data files PropertyAliases.txt and
* PropertyValueAliases.txt. It is used by:
* propname.cpp - reads data
* genpname - creates data
*/
/* low-level char * property name comparison -------------------------------- */
U_CDECL_BEGIN
/**
* \var uprv_comparePropertyNames
* Unicode property names and property value names are compared "loosely".
*
* UCD.html 4.0.1 says:
* For all property names, property value names, and for property values for
* Enumerated, Binary, or Catalog properties, use the following
* loose matching rule:
*
* LM3. Ignore case, whitespace, underscore ('_'), and hyphens.
*
* This function does just that, for (char *) name strings.
* It is almost identical to ucnv_compareNames() but also ignores
* C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
*
* @internal
*/
/* Loose comparison for names in an ASCII-based charset; returns 0 when the names match. */
U_CAPI int32_t U_EXPORT2
uprv_compareASCIIPropertyNames(const char *name1, const char *name2);
/* Loose comparison for names in an EBCDIC-based charset; returns 0 when the names match. */
U_CAPI int32_t U_EXPORT2
uprv_compareEBCDICPropertyNames(const char *name1, const char *name2);
/* uprv_comparePropertyNames resolves to the variant for the compile-time charset family. */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
# define uprv_comparePropertyNames uprv_compareASCIIPropertyNames
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
# define uprv_comparePropertyNames uprv_compareEBCDICPropertyNames
#else
# error U_CHARSET_FAMILY is not valid
#endif
U_CDECL_END
/* UDataMemory structure and signatures ------------------------------------- */
#define PNAME_DATA_NAME "pnames"
#define PNAME_DATA_TYPE "icu"
/* Fields in UDataInfo: */
/* PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler */
#define PNAME_SIG_0 ((uint8_t)0x70) /* p */
#define PNAME_SIG_1 ((uint8_t)0x6E) /* n */
#define PNAME_SIG_2 ((uint8_t)0x61) /* a */
#define PNAME_SIG_3 ((uint8_t)0x6D) /* m */
U_NAMESPACE_BEGIN
/**
 * Static-only accessor for the property names data: maps between property /
 * property value enum constants and their names (aliases) in both directions.
 */
class PropNameData {
public:
// Positions in indexes[]; the enumerator order defines the data layout.
enum {
// Byte offsets from the start of the data, after the generic header.
IX_VALUE_MAPS_OFFSET,
IX_BYTE_TRIES_OFFSET,
IX_NAME_GROUPS_OFFSET,
IX_RESERVED3_OFFSET,
IX_RESERVED4_OFFSET,
IX_TOTAL_SIZE,
// Other values.
IX_MAX_NAME_LENGTH,
IX_RESERVED7,
IX_COUNT
};
// Returns the nameChoice-th name of the property, or nullptr if the property
// is unknown or has no such name.
static const char *getPropertyName(int32_t property, int32_t nameChoice);
// Returns the nameChoice-th name of a value of the property, or nullptr if
// the property or value is unknown or has no such name.
static const char *getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice);
// Returns the property enum for the alias (loosely matched),
// or UCHAR_INVALID_CODE if the alias is not found.
static int32_t getPropertyEnum(const char *alias);
// Returns the property value for the alias (loosely matched), or
// UCHAR_INVALID_CODE if the property is unknown, has no named values,
// or the alias is not found.
static int32_t getPropertyValueEnum(int32_t property, const char *alias);
private:
// Returns the valueMaps[] index of the property's entry pair, or 0 if not found.
static int32_t findProperty(int32_t property);
// Returns the nameGroups[] offset for a value within the value map at
// valueMapIndex, or 0 if there is none.
static int32_t findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value);
// Returns the nameIndex-th string of a name group, or nullptr if the index
// is out of range or the string is empty.
static const char *getName(const char *nameGroup, int32_t nameIndex);
// Feeds the normalized name into the trie; true if the trie ends on a value.
static UBool containsName(BytesTrie &trie, const char *name);
// Looks up the alias in the trie at bytesTries+bytesTrieOffset;
// returns its value or UCHAR_INVALID_CODE.
static int32_t getPropertyOrValueEnum(int32_t bytesTrieOffset, const char *alias);
// The data arrays declared here are defined elsewhere.
static const int32_t indexes[];
static const int32_t valueMaps[];
static const uint8_t bytesTries[];
static const char nameGroups[];
};
/*
* pnames.icu formatVersion 2
*
* formatVersion 2 is new in ICU 4.8.
* In ICU 4.8, the pnames.icu data file is used only in ICU4J.
* ICU4C 4.8 has the same data structures hardcoded in source/common/propname_data.h.
*
* For documentation of pnames.icu formatVersion 1 see ICU4C 4.6 (2010-dec-01)
* or earlier versions of this header file (source/common/propname.h).
*
* The pnames.icu begins with the standard ICU DataHeader/UDataInfo.
* After that:
*
* int32_t indexes[8];
*
* (See the PropNameData::IX_... constants.)
*
* The first 6 indexes are byte offsets from the beginning of the data
* (beginning of indexes[]) to following structures.
* The length of each structure is the difference between its offset
* and the next one.
* All offsets are filled in: Where there is no data between two offsets,
* those two offsets are the same.
* The last offset (indexes[PropNameData::IX_TOTAL_SIZE]) indicates the
* total number of bytes in the file. (Not counting the standard headers.)
*
* The seventh index (indexes[PropNameData::IX_MAX_NAME_LENGTH]) has the
* maximum length of any Unicode property (or property value) alias.
* (Without normalization, that is, including underscores etc.)
*
* int32_t valueMaps[];
*
* The valueMaps[] begins with a map from UProperty enums to properties,
* followed by the per-property value maps from property values to names,
* for those properties that have named values.
* (Binary & enumerated, plus General_Category_Mask.)
*
* valueMaps[0] contains the number of UProperty enum ranges.
* For each range:
* int32_t start, limit -- first and last+1 UProperty enum of a dense range
* Followed by (limit-start) pairs of
* int32_t nameGroupOffset;
* Offset into nameGroups[] for the property's names/aliases.
* int32_t valueMapIndex;
* Offset of the property's value map in the valueMaps[] array.
* If the valueMapIndex is 0, then the property does not have named values.
*
* For each property's value map:
* int32_t bytesTrieOffset; -- Offset into bytesTries[] for name->value mapping.
* int32_t numRanges;
* If numRanges is in the range 1..15, then that many ranges of values follow.
* Per range:
* int32_t start, limit -- first and last+1 property value of a range
* Followed by (limit-start) entries of
* int32_t nameGroupOffset;
* Offset into nameGroups[] for the property value's names/aliases.
* If the nameGroupOffset is 0, then this is not a named value for this property.
* (That is, the ranges need not be dense.)
* If numRanges is >=0x10, then (numRanges-0x10) sorted values
* and then (numRanges-0x10) corresponding nameGroupOffsets follow.
* Values are sorted as signed integers.
* In this case, the set of values is dense; no nameGroupOffset will be 0.
*
* For both properties and property values, ranges are sorted by their start/limit values.
*
* uint8_t bytesTries[];
*
* This is a sequence of BytesTrie structures, byte-serialized tries for
* mapping from names/aliases to values.
* The first one maps from property names/aliases to UProperty enum constants.
* The following ones are indexed by property value map bytesTrieOffsets
* for mapping each property's names/aliases to their property values.
*
* char nameGroups[];
*
* This is a sequence of property name groups.
* Each group is a list of names/aliases (invariant-character strings) for
* one property or property value, in the order of UPropertyNameChoice.
* The first byte of each group is the number of names in the group.
* It is followed by that many NUL-terminated strings.
* The first string is for the short name; if there is no short name,
* then the first string is empty.
* The second string is the long name. Further strings are additional aliases.
*
* The first name group is for a property rather than a property value,
* so that a nameGroupOffset of 0 can be used to indicate "no value"
* in a property's sparse value ranges.
*/
U_NAMESPACE_END
#endif

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,529 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: propsvec.c
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002feb22
* created by: Markus W. Scherer
*
* Store bits (Unicode character properties) in bit set vectors.
*/
#include <stdlib.h>
#include "unicode/utypes.h"
#include "cmemory.h"
#include "utrie.h"
#include "utrie2.h"
#include "uarrsort.h"
#include "propsvec.h"
#include "uassert.h"
/* Builder state for a table of properties-vector rows (see upvec_open()). */
struct UPropsVectors {
uint32_t *v; /* row storage: maxRows rows of `columns` uint32_t cells each */
int32_t columns; /* number of columns, plus two for start & limit values */
int32_t maxRows; /* number of rows that fit into v */
int32_t rows; /* number of rows currently in use */
int32_t prevRow; /* search optimization: remember last row seen */
UBool isCompacted; /* set by upvec_compact(); setters and row getters then refuse to work */
};
/* Row capacity allocated by upvec_open(). */
#define UPVEC_INITIAL_ROWS (1<<12)
/* Row capacity after the first growth step in upvec_setValue(). */
#define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
/* Largest row capacity; upvec_setValue() reports U_INTERNAL_PROGRAM_ERROR beyond it. */
#define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
/*
 * Allocates a properties-vectors builder with `columns` value columns per row.
 *
 * The new table holds one row for the range [0..0x110000[ and one row for each
 * special code point UPVEC_FIRST_SPECIAL_CP..UPVEC_MAX_CP, all with zero values.
 *
 * @param columns    number of value columns, at least 1; must be small enough
 *                   that the initial table size in bytes fits into an int32_t
 * @param pErrorCode in/out ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                   a bad column count and U_MEMORY_ALLOCATION_ERROR if
 *                   allocation fails
 * @return the new builder (release with upvec_close()), or nullptr on failure
 */
U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns, UErrorCode *pErrorCode) {
    UPropsVectors *pv;
    uint32_t *v, *row;
    uint32_t cp;
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    /*
     * The upper bound keeps UPVEC_INITIAL_ROWS*(columns+2)*4 within int32_t:
     * without it the size computation overflows (undefined behavior) and the
     * table would be allocated too small for the rows written below.
     */
    if(columns<1 || columns>(INT32_MAX/(UPVEC_INITIAL_ROWS*4))-2) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    columns+=2; /* count range start and limit columns */
    pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
    v=(uint32_t *)uprv_malloc((size_t)UPVEC_INITIAL_ROWS*columns*4);
    if(pv==nullptr || v==nullptr) {
        /* one of the two allocations may have succeeded; release both */
        uprv_free(pv);
        uprv_free(v);
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    uprv_memset(pv, 0, sizeof(UPropsVectors));
    pv->v=v;
    pv->columns=columns;
    pv->maxRows=UPVEC_INITIAL_ROWS;
    pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
    /* set the all-Unicode row and the special-value rows */
    row=pv->v;
    uprv_memset(row, 0, (size_t)pv->rows*columns*4);
    row[0]=0;
    row[1]=0x110000;
    row+=columns;
    for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
        /* each special code point gets its own single-code-point row */
        row[0]=cp;
        row[1]=cp+1;
        row+=columns;
    }
    return pv;
}
/* Releases a builder and its row storage; a null pointer is ignored. */
U_CAPI void U_EXPORT2
upvec_close(UPropsVectors *pv) {
    if(pv==nullptr) {
        return;
    }
    uprv_free(pv->v);
    uprv_free(pv);
}
/*
 * Returns a pointer to the row whose range [row[0]..row[1][ contains rangeStart
 * and stores that row's index in pv->prevRow for the next search.
 * First probes the rows at and shortly after the previously found row,
 * then falls back to a binary search over all rows.
 */
static uint32_t *
_findRow(UPropsVectors *pv, UChar32 rangeStart) {
uint32_t *row;
int32_t columns, i, start, limit, prevRow;
columns=pv->columns;
limit=pv->rows;
prevRow=pv->prevRow;
/* check the vicinity of the last-seen row (start searching with an unrolled loop) */
row=pv->v+prevRow*columns;
if(rangeStart>=(UChar32)row[0]) {
if(rangeStart<(UChar32)row[1]) {
/* same row as last seen */
return row;
} else if(rangeStart<(UChar32)(row+=columns)[1]) {
/* next row after the last one */
pv->prevRow=prevRow+1;
return row;
} else if(rangeStart<(UChar32)(row+=columns)[1]) {
/* second row after the last one */
pv->prevRow=prevRow+2;
return row;
} else if((rangeStart-(UChar32)row[1])<10) {
/* we are close, continue looping */
/* row was already advanced twice by the tests above; keep prevRow in step with it */
prevRow+=2;
do {
++prevRow;
row+=columns;
} while(rangeStart>=(UChar32)row[1]);
pv->prevRow=prevRow;
return row;
}
} else if(rangeStart<(UChar32)pv->v[1]) {
/* the very first row */
pv->prevRow=0;
return pv->v;
}
/* do a binary search for the start of the range */
start=0;
while(start<limit-1) {
i=(start+limit)/2;
row=pv->v+i*columns;
if(rangeStart<(UChar32)row[0]) {
limit=i;
} else if(rangeStart<(UChar32)row[1]) {
pv->prevRow=i;
return row;
} else {
start=i;
}
}
/* must be found because all ranges together always cover all of Unicode */
pv->prevRow=start;
return pv->v+start*columns;
}
/*
 * Sets the masked bits of one value column for all code points in [start..end].
 *
 * Rows that only partially overlap the input range and whose current value
 * differs are split first, so that the new value applies exactly to the range.
 * The row table is grown when the split rows do not fit.
 *
 * @param pv         the builder; must not be compacted yet
 * @param start      first code point of the range
 * @param end        last code point of the range (inclusive), at most UPVEC_MAX_CP
 * @param column     value column index, 0..(number of value columns - 1)
 * @param value      new bits; only the bits selected by mask are used
 * @param mask       selects the bits of the column that are replaced
 * @param pErrorCode in/out ICU error code: U_ILLEGAL_ARGUMENT_ERROR for bad
 *                   arguments, U_NO_WRITE_PERMISSION after compaction,
 *                   U_MEMORY_ALLOCATION_ERROR if the table cannot be grown,
 *                   U_INTERNAL_PROGRAM_ERROR if it is already at maximum size
 */
U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors *pv,
               UChar32 start, UChar32 end,
               int32_t column,
               uint32_t value, uint32_t mask,
               UErrorCode *pErrorCode) {
    uint32_t *firstRow, *lastRow;
    int32_t columns;
    UChar32 limit;
    UBool splitFirstRow, splitLastRow;
    /* argument checking */
    if(U_FAILURE(*pErrorCode)) {
        return;
    }
    if( pv==nullptr ||
        start<0 || start>end || end>UPVEC_MAX_CP ||
        column<0 || column>=(pv->columns-2)
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(pv->isCompacted) {
        *pErrorCode=U_NO_WRITE_PERMISSION;
        return;
    }
    limit=end+1;
    /* initialize */
    columns=pv->columns;
    column+=2; /* skip range start and limit columns */
    value&=mask;
    /* find the rows whose ranges overlap with the input range */
    /* find the first and last rows, always successful */
    firstRow=_findRow(pv, start);
    lastRow=_findRow(pv, end);
    /*
     * Rows need to be split if they partially overlap with the
     * input range (only possible for the first and last rows)
     * and if their value differs from the input value.
     */
    splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
    splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
    /* split first/last rows if necessary */
    if(splitFirstRow || splitLastRow) {
        int32_t count, rows;
        rows=pv->rows;
        if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
            uint32_t *newVectors;
            int32_t newMaxRows;
            if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
                newMaxRows=UPVEC_MEDIUM_ROWS;
            } else if(pv->maxRows<UPVEC_MAX_ROWS) {
                newMaxRows=UPVEC_MAX_ROWS;
            } else {
                /* Implementation bug, or UPVEC_MAX_ROWS too low. */
                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                return;
            }
            /*
             * Refuse to grow if newMaxRows*columns*4 does not fit into int32_t:
             * the size computation would overflow (undefined behavior) and
             * could yield a table that is too small for the rows copied into it.
             */
            if(columns>INT32_MAX/4/newMaxRows) {
                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            newVectors=(uint32_t *)uprv_malloc((size_t)newMaxRows*columns*4);
            if(newVectors==nullptr) {
                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4);
            /* rebase the row pointers onto the new storage before freeing the old one */
            firstRow=newVectors+(firstRow-pv->v);
            lastRow=newVectors+(lastRow-pv->v);
            uprv_free(pv->v);
            pv->v=newVectors;
            pv->maxRows=newMaxRows;
        }
        /* count the number of row cells to move after the last row, and move them */
        count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
        if(count>0) {
            uprv_memmove(
                lastRow+(1+splitFirstRow+splitLastRow)*columns,
                lastRow+columns,
                (size_t)count*4);
        }
        pv->rows=rows+splitFirstRow+splitLastRow;
        /* split the first row, and move the firstRow pointer to the second part */
        if(splitFirstRow) {
            /* copy all affected rows up one and move the lastRow pointer */
            count = (int32_t)((lastRow-firstRow)+columns);
            uprv_memmove(firstRow+columns, firstRow, (size_t)count*4);
            lastRow+=columns;
            /* split the range and move the firstRow pointer */
            firstRow[1]=firstRow[columns]=(uint32_t)start;
            firstRow+=columns;
        }
        /* split the last row */
        if(splitLastRow) {
            /* copy the last row data */
            uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4);
            /* split the range at limit; lastRow stays on the part inside the input range */
            lastRow[1]=lastRow[columns]=(uint32_t)limit;
        }
    }
    /* set the "row last seen" to the last row for the range */
    pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
    /* set the input value in all remaining rows */
    firstRow+=column;
    lastRow+=column;
    mask=~mask; /* from here on the mask selects the bits to keep */
    for(;;) {
        *firstRow=(*firstRow&mask)|value;
        if(firstRow==lastRow) {
            break;
        }
        firstRow+=columns;
    }
}
/*
 * Returns the value stored in the given column for code point c.
 * Yields 0 for a compacted builder or out-of-range arguments.
 */
U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
    if(pv->isCompacted) {
        return 0;
    }
    if(c<0 || c>UPVEC_MAX_CP) {
        return 0;
    }
    if(column<0 || column>=(pv->columns-2)) {
        return 0;
    }
    /* _findRow() updates the prevRow search cache, so it needs a non-const pointer. */
    UPropsVectors *mutablePv=const_cast<UPropsVectors *>(pv);
    const uint32_t *row=_findRow(mutablePv, c);
    return row[2+column];
}
/*
 * Returns a pointer to the value columns of row rowIndex and optionally
 * reports the row's code point range (start and inclusive end).
 * Returns nullptr for a compacted builder or an out-of-range index.
 */
U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
             UChar32 *pRangeStart, UChar32 *pRangeEnd) {
    if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
        return nullptr;
    }
    uint32_t *row=pv->v+rowIndex*pv->columns;
    if(pRangeStart!=nullptr) {
        *pRangeStart=(UChar32)row[0];
    }
    if(pRangeEnd!=nullptr) {
        /* row[1] is the exclusive limit; report the inclusive end */
        *pRangeEnd=(UChar32)row[1]-1;
    }
    /* the values follow the start and limit cells */
    return row+2;
}
/*
 * Sort comparator for two rows. The value columns are compared first and the
 * start/limit columns last. context is the UPropsVectors being sorted.
 */
static int32_t U_CALLCONV
upvec_compareRows(const void *context, const void *l, const void *r) {
    const UPropsVectors *pv=(const UPropsVectors *)context;
    const uint32_t *left=(const uint32_t *)l;
    const uint32_t *right=(const uint32_t *)r;
    const int32_t columns=pv->columns; /* includes start/limit columns */
    int32_t remaining=columns;
    int32_t col=2; /* begin after start/limit, wrap around to them at the end */
    do {
        const uint32_t a=left[col];
        const uint32_t b=right[col];
        if(a!=b) {
            return a<b ? -1 : 1;
        }
        ++col;
        if(col==columns) {
            col=0;
        }
        --remaining;
    } while(remaining>0);
    return 0;
}
/*
 * Compacts the builder in place into an array of unique value vectors and
 * reports each row to the handler together with the index of its vector.
 *
 * The handler is called first for the special-value rows, then once with
 * UPVEC_START_REAL_VALUES_CP, then for each row of real code points.
 * After this call the builder is marked compacted and pv->rows counts the
 * unique vectors. A second call on a compacted builder does nothing.
 *
 * @param pv         the builder to compact
 * @param handler    callback receiving (context, start, end, vector index,
 *                   vector pointer, number of value columns, pErrorCode)
 * @param context    passed through to the handler unchanged
 * @param pErrorCode in/out ICU error code; U_ILLEGAL_ARGUMENT_ERROR if handler
 *                   is null; processing stops at the first failure
 */
U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
uint32_t *row;
int32_t i, columns, valueColumns, rows, count;
UChar32 start, limit;
/* argument checking */
if(U_FAILURE(*pErrorCode)) {
return;
}
if(handler==nullptr) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(pv->isCompacted) {
return;
}
/* Set the flag now: Sorting and compacting destroys the builder data structure. */
pv->isCompacted=true;
rows=pv->rows;
columns=pv->columns;
U_ASSERT(columns>=3); /* upvec_open() requires at least 1 value column and adds 2 */
valueColumns=columns-2; /* not counting start & limit */
/* sort the properties vectors to find unique vector values */
uprv_sortArray(pv->v, rows, columns*4,
upvec_compareRows, pv, false, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
/*
* Find and set the special values.
* This has to do almost the same work as the compaction below,
* to find the indexes where the special-value rows will move.
*/
row=pv->v;
/* count is the offset of the current unique vector; negative until the first row is counted */
count=-valueColumns;
for(i=0; i<rows; ++i) {
start=(UChar32)row[0];
/* count a new values vector if it is different from the current one */
/* row-valueColumns is where the previous row's values start (rows are still full width here) */
if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
count+=valueColumns;
}
if(start>=UPVEC_FIRST_SPECIAL_CP) {
handler(context, start, start, count, row+2, valueColumns, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
}
row+=columns;
}
/* count is at the beginning of the last vector, add valueColumns to include that last vector */
count+=valueColumns;
/* Call the handler once more to signal the start of delivering real values. */
handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
count, row-valueColumns, valueColumns, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
/*
* Move vector contents up to a contiguous array with only unique
* vector values, and call the handler function for each vector.
*
* This destroys the Properties Vector structure and replaces it
* with an array of just vector values.
*/
row=pv->v;
count=-valueColumns;
for(i=0; i<rows; ++i) {
/* fetch these first before memmove() may overwrite them */
start=(UChar32)row[0];
limit=(UChar32)row[1];
/* add a new values vector if it is different from the current one */
/* pv->v+count is the most recently written unique vector in the compacted part */
if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
count+=valueColumns;
uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4);
}
if(start<UPVEC_FIRST_SPECIAL_CP) {
handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
}
row+=columns;
}
/* count is at the beginning of the last vector, add one to include that last vector */
pv->rows=count/valueColumns+1;
}
/*
 * Returns the compacted array of value vectors and optionally its dimensions.
 * Returns nullptr if the builder has not been compacted.
 */
U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
    if(pv->isCompacted) {
        if(pRows!=nullptr) {
            *pRows=pv->rows;
        }
        if(pColumns!=nullptr) {
            /* report value columns only, without start & limit */
            *pColumns=pv->columns-2;
        }
        return pv->v;
    }
    return nullptr;
}
/*
 * Returns a newly allocated copy of the compacted array of value vectors and
 * optionally its dimensions. Sets U_ILLEGAL_ARGUMENT_ERROR if the builder has
 * not been compacted and U_MEMORY_ALLOCATION_ERROR if the copy cannot be
 * allocated; returns nullptr in both cases.
 */
U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors *pv,
                 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    if(!pv->isCompacted) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    /* size of the compacted array: rows * value columns * 4 bytes */
    const int32_t byteLength=pv->rows*(pv->columns-2)*4;
    uint32_t *copy=(uint32_t *)uprv_malloc(byteLength);
    if(copy==nullptr) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    uprv_memcpy(copy, pv->v, byteLength);
    if(pRows!=nullptr) {
        *pRows=pv->rows;
    }
    if(pColumns!=nullptr) {
        *pColumns=pv->columns-2;
    }
    return copy;
}
/*
 * Compacts the builder and returns a frozen 16-bit UTrie2 built by
 * upvec_compactToUTrie2Handler(). On failure the trie is closed and
 * nullptr is returned.
 */
U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
    UPVecToUTrie2Context toUTrie2={ nullptr, 0, 0, 0 };
    upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
    utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
    if(!U_FAILURE(*pErrorCode)) {
        return toUTrie2.trie;
    }
    /* failure: do not hand out a partially built trie */
    utrie2_close(toUTrie2.trie);
    toUTrie2.trie=nullptr;
    return toUTrie2.trie;
}
/*
* TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
* some 16-bit field and builds and returns a UTrie2.
*/
/*
 * upvec_compact() handler that builds a UTrie2 from row indexes.
 * context must point to a UPVecToUTrie2Context.
 */
U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void *context,
                             UChar32 start, UChar32 end,
                             int32_t rowIndex, uint32_t *row, int32_t columns,
                             UErrorCode *pErrorCode) {
    (void)row;
    (void)columns;
    UPVecToUTrie2Context *ctx=(UPVecToUTrie2Context *)context;
    if(start<UPVEC_FIRST_SPECIAL_CP) {
        /* real code points: store the row index as the trie value for the range */
        utrie2_setRange32(ctx->trie, start, end, (uint32_t)rowIndex, true, pErrorCode);
        return;
    }
    if(start==UPVEC_INITIAL_VALUE_CP) {
        ctx->initialValue=rowIndex;
    } else if(start==UPVEC_ERROR_VALUE_CP) {
        ctx->errorValue=rowIndex;
    } else if(start==UPVEC_START_REAL_VALUES_CP) {
        ctx->maxValue=rowIndex;
        if(rowIndex>0xffff) {
            /* too many rows for a 16-bit trie */
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        } else {
            /* the special values have been recorded; the trie can be created now */
            ctx->trie=utrie2_open(ctx->initialValue,
                                  ctx->errorValue, pErrorCode);
        }
    }
    /* any other special code point is ignored */
}

View file

@ -0,0 +1,178 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: propsvec.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002feb22
* created by: Markus W. Scherer
*
* Store bits (Unicode character properties) in bit set vectors.
*/
#ifndef __UPROPSVEC_H__
#define __UPROPSVEC_H__
#include "unicode/utypes.h"
#include "utrie.h"
#include "utrie2.h"
U_CDECL_BEGIN
/**
* Unicode Properties Vectors associated with code point ranges.
*
* Rows of uint32_t integers in a contiguous array store
* the range limits and the properties vectors.
*
* Logically, each row has a certain number of uint32_t values,
* which is set via the upvec_open() "columns" parameter.
*
* Internally, two additional columns are stored.
* In each internal row,
* row[0] contains the start code point and
* row[1] contains the limit code point,
* which is the start of the next range.
*
* Initially, there is only one "normal" row for
* range [0..0x110000[ with values 0.
* There are additional rows for special purposes, see UPVEC_FIRST_SPECIAL_CP.
*
* It would be possible to store only one range boundary per row,
* but self-contained rows allow to later sort them by contents.
*/
struct UPropsVectors;
typedef struct UPropsVectors UPropsVectors;
/*
* Special pseudo code points for storing the initialValue and the errorValue,
* which are used to initialize a UTrie2 or similar.
*/
#define UPVEC_FIRST_SPECIAL_CP 0x110000
#define UPVEC_INITIAL_VALUE_CP 0x110000
#define UPVEC_ERROR_VALUE_CP 0x110001
#define UPVEC_MAX_CP 0x110001
/*
* Special pseudo code point used in upvec_compact() signalling the end of
* delivering special values and the beginning of delivering real ones.
* Stable value, unlike UPVEC_MAX_CP which might grow over time.
*/
#define UPVEC_START_REAL_VALUES_CP 0x200000
/*
* Open a UPropsVectors object.
* @param columns Number of value integers (uint32_t) per row.
*/
U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns, UErrorCode *pErrorCode);
U_CAPI void U_EXPORT2
upvec_close(UPropsVectors *pv);
/*
* In rows for code points [start..end], select the column,
* reset the mask bits and set the value bits (ANDed with the mask).
*
* Will set U_NO_WRITE_PERMISSION if called after upvec_compact().
*/
U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors *pv,
UChar32 start, UChar32 end,
int32_t column,
uint32_t value, uint32_t mask,
UErrorCode *pErrorCode);
/*
* Logically const but must not be used on the same pv concurrently!
* Always returns 0 if called after upvec_compact().
*/
U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column);
/*
* pRangeStart and pRangeEnd can be NULL.
* @return NULL if rowIndex out of range and for illegal arguments,
* or if called after upvec_compact()
*/
U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
UChar32 *pRangeStart, UChar32 *pRangeEnd);
/*
* Compact the vectors:
* - modify the memory
* - keep only unique vectors
* - store them contiguously from the beginning of the memory
* - for each (non-unique) row, call the handler function
*
* The handler's rowIndex is the index of the row in the compacted
* memory block.
* (Therefore, it starts at 0 and increases in increments of the columns value.)
*
* In a first phase, only special values are delivered (each exactly once),
* with start==end both equalling a special pseudo code point.
* Then the handler is called once more with start==end==UPVEC_START_REAL_VALUES_CP
* where rowIndex is the length of the compacted array,
* and the row is arbitrary (but not NULL).
* Then, in the second phase, the handler is called for each row of real values.
*/
typedef void U_CALLCONV
UPVecCompactHandler(void *context,
UChar32 start, UChar32 end,
int32_t rowIndex, uint32_t *row, int32_t columns,
UErrorCode *pErrorCode);
U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode);
/*
* Get the vectors array after calling upvec_compact().
* The caller must not modify nor release the returned array.
* Returns NULL if called before upvec_compact().
*/
U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns);
/*
* Get a clone of the vectors array after calling upvec_compact().
* The caller owns the returned array and must uprv_free() it.
* Returns NULL if called before upvec_compact().
*/
U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors *pv,
int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode);
/*
* Call upvec_compact(), create a 16-bit UTrie2 with indexes into the compacted
* vectors array, and freeze the trie.
*/
U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode);
/*
 * State shared between upvec_compactToUTrie2WithRowIndexes() and
 * upvec_compactToUTrie2Handler() while a UTrie2 of row indexes is built.
 */
struct UPVecToUTrie2Context {
/* The trie being built; opened by the handler when it receives UPVEC_START_REAL_VALUES_CP. */
UTrie2 *trie;
/* Row index delivered for UPVEC_INITIAL_VALUE_CP; passed to utrie2_open() as the initial value. */
int32_t initialValue;
/* Row index delivered for UPVEC_ERROR_VALUE_CP; passed to utrie2_open() as the error value. */
int32_t errorValue;
/* Row index delivered with UPVEC_START_REAL_VALUES_CP; the handler rejects values above 0xffff. */
int32_t maxValue;
};
typedef struct UPVecToUTrie2Context UPVecToUTrie2Context;
/* context=UPVecToUTrie2Context, creates the trie and stores the rowIndex values */
U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void *context,
UChar32 start, UChar32 end,
int32_t rowIndex, uint32_t *row, int32_t columns,
UErrorCode *pErrorCode);
U_CDECL_END
#endif

View file

@ -0,0 +1,590 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: punycode.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan31
* created by: Markus W. Scherer
*/
/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
Disclaimer and license
Regarding this entire document or any portion of it (including
the pseudocode and C code), the author makes no guarantees and
is not responsible for any damage resulting from its use. The
author grants irrevocable permission to anyone to use, modify,
and distribute it in any way that does not diminish the rights
of anyone else to use, modify, and distribute it, provided that
redistributed derivative works do not contain misleading author or
version information. Derivative works need not be licensed under
similar terms.
*/
/*
* ICU modifications:
* - ICU data types and coding conventions
* - ICU string buffer handling with implicit source lengths
* and destination preflighting
* - UTF-16 handling
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include "unicode/ustring.h"
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "punycode.h"
#include "uassert.h"
/* Punycode ----------------------------------------------------------------- */
/* Punycode parameters for Bootstring */
#define BASE 36
#define TMIN 1
#define TMAX 26
#define SKEW 38
#define DAMP 700
#define INITIAL_BIAS 72
#define INITIAL_N 0x80
/* "Basic" Unicode/ASCII code points */
#define _HYPHEN 0X2d
#define DELIMITER _HYPHEN
#define _ZERO_ 0X30
#define _NINE 0x39
#define _SMALL_A 0X61
#define _SMALL_Z 0X7a
#define _CAPITAL_A 0X41
#define _CAPITAL_Z 0X5a
#define IS_BASIC(c) ((c)<0x80)
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
/**
 * Map a digit value (expected range 0..BASE-1) to the basic code point
 * that represents it in Punycode output.
 *
 *  0..25 -> 'a'..'z', or 'A'..'Z' when uppercase is nonzero
 * 26..35 -> '0'..'9' (the uppercase flag has no effect on these)
 */
static inline char
digitToBasic(int32_t digit, UBool uppercase) {
    if(digit>=26) {
        /* 26..35 map to ASCII 0..9 */
        return (char)(digit+(_ZERO_-26));
    }
    /* 0..25 map to ASCII a..z or A..Z */
    const int32_t firstLetter= uppercase ? _CAPITAL_A : _SMALL_A;
    return (char)(firstLetter+digit);
}
/**
 * Decode one Punycode digit.
 *
 * All comparisons use char16_t literals so the result depends only on the
 * Unicode value of cp, never on the execution character set of the compiler.
 *
 * @param cp a code unit / code point from the Punycode input
 * @return the numeric value of a basic code point (for use in representing integers)
 *         in the range 0 to BASE-1, or a negative value if cp is invalid.
 */
static int32_t decodeDigit(int32_t cp) {
    if(cp<=u'Z') {
        if(cp<=u'9') {
            if(cp<u'0') {
                return -1;
            }
            return cp-u'0'+26;  // 0..9 -> 26..35
        }
        // The code points between '9' and 'A' produce a negative (invalid) result here.
        return cp-u'A';  // A-Z -> 0..25
    }
    if(cp<=u'z') {
        // The code points between 'Z' and 'a' produce a negative (invalid) result here.
        return cp-u'a';  // a..z -> 0..25
    }
    return -1;
}
/**
 * Force the case of an ASCII letter.
 *
 * @param b         a basic (ASCII) character
 * @param uppercase nonzero: map a..z to A..Z; zero: map A..Z to a..z
 * @return the case-mapped character; anything that is not a letter of the
 *         opposite case is returned unchanged
 */
static inline char
asciiCaseMap(char b, UBool uppercase) {
    if(uppercase) {
        const bool isLowercase= _SMALL_A<=b && b<=_SMALL_Z;
        return isLowercase ? (char)(b-(_SMALL_A-_CAPITAL_A)) : b;
    }
    const bool isUppercase= _CAPITAL_A<=b && b<=_CAPITAL_Z;
    return isUppercase ? (char)(b+(_SMALL_A-_CAPITAL_A)) : b;
}
/* Punycode-specific Bootstring code ---------------------------------------- */
/*
* The following code omits the {parts} of the pseudo-algorithm in the spec
* that are not used with the Punycode parameter set.
*/
/*
 * Bias adaptation function.
 *
 * delta      the delta that was just encoded/decoded
 * length     divisor for the proportional increase of delta; callers in this
 *            file always pass a value >=1
 * firstTime  nonzero for the first adaptation: delta is then damped by DAMP
 *            instead of being halved
 *
 * Returns the new bias.
 */
static int32_t
adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    delta/= firstTime ? DAMP : 2;
    delta+=delta/length;

    int32_t count=0;
    while(delta>((BASE-TMIN)*TMAX)/2) {
        delta/=(BASE-TMIN);
        count+=BASE;
    }
    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}
namespace {
// ICU-13727: Limit input length for n^2 algorithm
// where well-formed strings are at most 59 characters long.
// Longer input is rejected with U_INPUT_TOO_LONG_ERROR.
// Maximum number of UTF-16 code units accepted by u_strToPunycode();
// also the size of its on-stack code point buffer.
constexpr int32_t ENCODE_MAX_CODE_UNITS=1000;
// Maximum number of input characters accepted by u_strFromPunycode().
constexpr int32_t DECODE_MAX_CHARS=2000;
} // namespace
// encode
/*
 * Encode the UTF-16 string src as Punycode into dest.
 *
 * src, srcLength      Input; srcLength==-1 means NUL-terminated. More than
 *                     ENCODE_MAX_CODE_UNITS code units are rejected.
 * dest, destCapacity  Output buffer. Output that does not fit is counted in
 *                     destLength but not written (preflighting).
 * caseFlags           Optional (may be nullptr), indexed by src code unit index.
 *                     For basic code points the flag selects the output case;
 *                     for others it selects the case of the last encoded digit.
 * pErrorCode          In/out error code. Set here to
 *                     U_ILLEGAL_ARGUMENT_ERROR (bad arguments),
 *                     U_INPUT_TOO_LONG_ERROR (input over the limit),
 *                     U_INVALID_CHAR_FOUND (unpaired surrogate) or
 *                     U_INTERNAL_PROGRAM_ERROR (delta would overflow).
 *
 * Returns 0 on error; otherwise the result of u_terminateUChars() for the
 * computed output length (presumably that length itself - confirm in ustr_imp.h).
 */
U_CAPI int32_t
u_strToPunycode(const char16_t *src, int32_t srcLength,
char16_t *dest, int32_t destCapacity,
const UBool *caseFlags,
UErrorCode *pErrorCode) {
/* One entry per input code point: 0 for basic ones, else the code point with the case flag in bit 31. */
int32_t cpBuffer[ENCODE_MAX_CODE_UNITS];
int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
char16_t c, c2;
/* argument checking */
if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
return 0;
}
if(src==nullptr || srcLength<-1 || (dest==nullptr && destCapacity!=0)) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if (srcLength>ENCODE_MAX_CODE_UNITS) {
*pErrorCode=U_INPUT_TOO_LONG_ERROR;
return 0;
}
/*
* Handle the basic code points and
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
*/
srcCPCount=destLength=0;
if(srcLength==-1) {
/* NUL-terminated input */
for(j=0; /* no condition */; ++j) {
if((c=src[j])==0) {
break;
}
/*
* The length limit is tested after the NUL test, so an input of exactly
* ENCODE_MAX_CODE_UNITS units is accepted, and before anything is stored,
* so cpBuffer cannot be overrun.
*/
if(j>=ENCODE_MAX_CODE_UNITS) {
*pErrorCode=U_INPUT_TOO_LONG_ERROR;
return 0;
}
if(IS_BASIC(c)) {
/* Basic code points are copied to the output right away; the 0 placeholder is never selected by the main loop (n starts at INITIAL_N). */
cpBuffer[srcCPCount++]=0;
if(destLength<destCapacity) {
dest[destLength]=
caseFlags!=nullptr ?
asciiCaseMap((char)c, caseFlags[j]) :
(char)c;
}
++destLength;
} else {
/* Start with only the case flag, stored in the sign bit. */
n=(caseFlags!=nullptr && caseFlags[j])<<31L;
if(U16_IS_SINGLE(c)) {
n|=c;
/* Reading src[j+1] is safe here: at worst it is the terminating NUL. */
} else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
++j;
n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
} else {
/* error: unmatched surrogate */
*pErrorCode=U_INVALID_CHAR_FOUND;
return 0;
}
cpBuffer[srcCPCount++]=n;
}
}
} else {
/* length-specified input */
for(j=0; j<srcLength; ++j) {
c=src[j];
if(IS_BASIC(c)) {
cpBuffer[srcCPCount++]=0;
if(destLength<destCapacity) {
dest[destLength]=
caseFlags!=nullptr ?
asciiCaseMap((char)c, caseFlags[j]) :
(char)c;
}
++destLength;
} else {
n=(caseFlags!=nullptr && caseFlags[j])<<31L;
if(U16_IS_SINGLE(c)) {
n|=c;
/* The (j+1)<srcLength test keeps the trail surrogate lookup inside the input. */
} else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
++j;
n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
} else {
/* error: unmatched surrogate */
*pErrorCode=U_INVALID_CHAR_FOUND;
return 0;
}
cpBuffer[srcCPCount++]=n;
}
}
}
/* Finish the basic string - if it is not empty - with a delimiter. */
basicLength=destLength;
if(basicLength>0) {
if(destLength<destCapacity) {
dest[destLength]=DELIMITER;
}
++destLength;
}
/*
* handledCPCount is the number of code points that have been handled
* basicLength is the number of basic code points
* destLength is the number of chars that have been output
*/
/* Initialize the state: */
n=INITIAL_N;
delta=0;
bias=INITIAL_BIAS;
/* Main encoding loop: */
for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
/*
* All non-basic code points < n have been handled already.
* Find the next larger one:
*/
for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
if(n<=q && q<m) {
m=q;
}
}
/*
* Increase delta enough to advance the decoder's
* <n,i> state to <m,0>, but guard against overflow:
*/
/* Subtracting handledCPCount here also leaves room for the ++delta steps in the loop below. */
if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
return 0;
}
delta+=(m-n)*(handledCPCount+1);
n=m;
/* Encode a sequence of same code points n */
for(j=0; j<srcCPCount; ++j) {
q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
if(q<n) {
/* Already handled (or basic): one more position for the decoder to skip. */
++delta;
} else if(q==n) {
/* Represent delta as a generalized variable-length integer: */
for(q=delta, k=BASE; /* no condition */; k+=BASE) {
/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
t=k-bias;
if(t<TMIN) {
t=TMIN;
} else if(t>TMAX) {
t=TMAX;
}
*/
/* t is the threshold for this digit position: k-bias clamped to TMIN..TMAX. */
t=k-bias;
if(t<TMIN) {
t=TMIN;
} else if(k>=(bias+TMAX)) {
t=TMAX;
}
if(q<t) {
break;
}
if(destLength<destCapacity) {
dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
}
++destLength;
q=(q-t)/(BASE-t);
}
/* Last digit: a negative cpBuffer entry means the case flag is set, so the digit is written in uppercase. */
if(destLength<destCapacity) {
dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
}
++destLength;
bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
delta=0;
++handledCPCount;
}
}
++delta;
++n;
}
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
// decode
/*
 * Decode the Punycode string src into UTF-16 in dest.
 *
 * src, srcLength      Input; srcLength==-1 means NUL-terminated (u_strlen() is used).
 *                     More than DECODE_MAX_CHARS characters are rejected.
 * dest, destCapacity  Output buffer. Once the output no longer fits, code points
 *                     are only counted, not inserted (preflighting).
 * caseFlags           Optional (may be nullptr) output array, written at the same
 *                     indexes as dest and therefore bounded by destCapacity as well.
 * pErrorCode          In/out error code. Set here to
 *                     U_ILLEGAL_ARGUMENT_ERROR (bad arguments),
 *                     U_INPUT_TOO_LONG_ERROR (input over the limit),
 *                     U_INVALID_CHAR_FOUND (non-basic character before the last
 *                     delimiter, or an invalid digit after it) or
 *                     U_ILLEGAL_CHAR_FOUND (truncated or overflowing delta sequence,
 *                     or a decoded value that is not a valid non-surrogate code point).
 *
 * Returns 0 on error; otherwise the result of u_terminateUChars() for the
 * computed output length (presumably that length itself - confirm in ustr_imp.h).
 */
U_CAPI int32_t
u_strFromPunycode(const char16_t *src, int32_t srcLength,
char16_t *dest, int32_t destCapacity,
UBool *caseFlags,
UErrorCode *pErrorCode) {
int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
destCPCount, firstSupplementaryIndex, cpLength;
char16_t b;
/* argument checking */
if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
return 0;
}
if(src==nullptr || srcLength<-1 || (dest==nullptr && destCapacity!=0)) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if(srcLength==-1) {
srcLength=u_strlen(src);
}
if (srcLength>DECODE_MAX_CHARS) {
*pErrorCode=U_INPUT_TOO_LONG_ERROR;
return 0;
}
/*
* Handle the basic code points:
* Let basicLength be the number of input code points
* before the last delimiter, or 0 if there is none,
* then copy the first basicLength code points to the output.
*
* The two following loops iterate backward.
*/
/* After this loop j is the index of the last delimiter, or 0 if there is none. */
for(j=srcLength; j>0;) {
if(src[--j]==DELIMITER) {
break;
}
}
/* Basic code points are one code unit each, so code point count and code unit length start out equal. */
destLength=basicLength=destCPCount=j;
U_ASSERT(destLength>=0);
while(j>0) {
b=src[--j];
if(!IS_BASIC(b)) {
*pErrorCode=U_INVALID_CHAR_FOUND;
return 0;
}
if(j<destCapacity) {
dest[j]=(char16_t)b;
if(caseFlags!=nullptr) {
caseFlags[j]=IS_BASIC_UPPERCASE(b);
}
}
}
/* Initialize the state: */
n=INITIAL_N;
i=0;
bias=INITIAL_BIAS;
/*
* Code unit index of the first supplementary code point in dest. Before that
* index, code point indexes and code unit indexes coincide. The large initial
* value means that no supplementary code point has been inserted yet.
*/
firstSupplementaryIndex=1000000000;
/*
* Main decoding loop:
* Start just after the last delimiter if any
* basic code points were copied; start at the beginning otherwise.
*/
for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
/*
* in is the index of the next character to be consumed, and
* destCPCount is the number of code points in the output array.
*
* Decode a generalized variable-length integer into delta,
* which gets added to i. The overflow checking is easier
* if we increase i as we go, then subtract off its starting
* value at the end to obtain delta.
*/
for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
/* The input ended in the middle of a variable-length integer. */
if(in>=srcLength) {
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
return 0;
}
digit=decodeDigit(src[in++]);
if(digit<0) {
*pErrorCode=U_INVALID_CHAR_FOUND;
return 0;
}
if(digit>(0x7fffffff-i)/w) {
/* integer overflow */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
return 0;
}
i+=digit*w;
/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
t=k-bias;
if(t<TMIN) {
t=TMIN;
} else if(t>TMAX) {
t=TMAX;
}
*/
/* t is the threshold for this digit position: k-bias clamped to TMIN..TMAX. */
t=k-bias;
if(t<TMIN) {
t=TMIN;
} else if(k>=(bias+TMAX)) {
t=TMAX;
}
/* A digit below the threshold is the last digit of this integer. */
if(digit<t) {
break;
}
if(w>0x7fffffff/(BASE-t)) {
/* integer overflow */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
return 0;
}
w*=BASE-t;
}
/*
* Modification from sample code:
* Increments destCPCount here,
* where needed instead of in for() loop tail.
*/
++destCPCount;
bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
/*
* i was supposed to wrap around from (incremented) destCPCount to 0,
* incrementing n each time, so we'll fix that now:
*/
if(i/destCPCount>(0x7fffffff-n)) {
/* integer overflow */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
return 0;
}
n+=i/destCPCount;
i%=destCPCount;
/* not needed for Punycode: */
/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
if(n>0x10ffff || U_IS_SURROGATE(n)) {
/* Unicode code point overflow */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
return 0;
}
/* Insert n at position i of the output: */
cpLength=U16_LENGTH(n);
/* Insert only while everything still fits; otherwise just keep counting destLength below. */
if(dest!=nullptr && ((destLength+cpLength)<=destCapacity)) {
int32_t codeUnitIndex;
/*
* Handle indexes when supplementary code points are present.
*
* In almost all cases, there will be only BMP code points before i
* and even in the entire string.
* This is handled with the same efficiency as with UTF-32.
*
* Only the rare cases with supplementary code points are handled
* more slowly - but not too bad since this is an insertion anyway.
*/
if(i<=firstSupplementaryIndex) {
codeUnitIndex=i;
if(cpLength>1) {
/* The new code point becomes the first supplementary one. */
firstSupplementaryIndex=codeUnitIndex;
} else {
/* A BMP code point inserted at or before it shifts the first supplementary one by one unit. */
++firstSupplementaryIndex;
}
} else {
/* Walk forward from the first supplementary code point to convert the code point index i into a code unit index. */
codeUnitIndex=firstSupplementaryIndex;
U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
}
/* use the char16_t index codeUnitIndex instead of the code point index i */
if(codeUnitIndex<destLength) {
/* Open a gap of cpLength units in dest (and in caseFlags) for the new code point. */
uprv_memmove(dest+codeUnitIndex+cpLength,
dest+codeUnitIndex,
(destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
if(caseFlags!=nullptr) {
uprv_memmove(caseFlags+codeUnitIndex+cpLength,
caseFlags+codeUnitIndex,
destLength-codeUnitIndex);
}
}
if(cpLength==1) {
/* BMP, insert one code unit */
dest[codeUnitIndex]=(char16_t)n;
} else {
/* supplementary character, insert two code units */
dest[codeUnitIndex]=U16_LEAD(n);
dest[codeUnitIndex+1]=U16_TRAIL(n);
}
if(caseFlags!=nullptr) {
/* Case of last character determines uppercase flag: */
caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
if(cpLength==2) {
caseFlags[codeUnitIndex+1]=false;
}
}
}
destLength+=cpLength;
U_ASSERT(destLength>=0);
/* The next insertion position starts just after the code point that was handled. */
++i;
}
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
#endif /* #if !UCONFIG_NO_IDNA */

View file

@ -0,0 +1,120 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: punycode.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan31
* created by: Markus W. Scherer
*/
/* This ICU code derived from: */
/*
punycode.c 0.4.0 (2001-Nov-17-Sat)
http://www.cs.berkeley.edu/~amc/idn/
Adam M. Costello
http://www.nicemice.net/amc/
*/
#ifndef __PUNYCODE_H__
#define __PUNYCODE_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
/**
* u_strToPunycode() converts Unicode to Punycode.
*
* The input string must not contain single, unpaired surrogates.
* The output will be represented as an array of ASCII code points.
*
* The output string is NUL-terminated according to normal ICU
* string output rules.
*
* @param src Input Unicode string.
* This function handles a limited number of code units
* (the limit is >=64; the current implementation accepts
* at most 1000 UTF-16 code units).
* U_INPUT_TOO_LONG_ERROR is set if the limit is exceeded.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output Punycode array.
* @param destCapacity Size of dest.
* @param caseFlags Vector of boolean values, one per input UChar,
* indicating that the corresponding character is to be
* marked for the decoder optionally
* uppercasing (true) or lowercasing (false)
* the character.
* ASCII characters are output directly in the case as marked.
* Flags corresponding to trail surrogates are ignored.
* If caseFlags==NULL then input characters are not
* case-mapped.
* @param pErrorCode ICU in/out error code parameter.
* U_ILLEGAL_ARGUMENT_ERROR if src is NULL, srcLength<-1,
* or dest is NULL with a non-zero destCapacity.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INPUT_TOO_LONG_ERROR if src contains
* too many code units.
* @return Number of ASCII characters in puny.
*
* @see u_strFromPunycode
*/
U_CAPI int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
UChar *dest, int32_t destCapacity,
const UBool *caseFlags,
UErrorCode *pErrorCode);
/**
* u_strFromPunycode() converts Punycode to Unicode.
* The Unicode string will be at most as long (in UChars)
* as the Punycode string (in chars).
*
* @param src Input Punycode string.
* The current implementation accepts at most 2000 characters.
* @param srcLength Length of puny, or -1 if NUL-terminated
* @param dest Output Unicode string buffer.
* @param destCapacity Size of dest in number of UChars,
* and of caseFlags in numbers of UBools.
* @param caseFlags Output array for case flags as
* defined by the Punycode string.
* The caller should uppercase (true) or lowercase (false)
* the corresponding character in dest.
* For supplementary characters, only the lead surrogate
* is marked, and false is stored for the trail surrogate.
* This is redundant and not necessary for ASCII characters
* because they are already in the case indicated.
* Can be NULL if the case flags are not needed.
* @param pErrorCode ICU in/out error code parameter.
* U_ILLEGAL_ARGUMENT_ERROR if src is NULL, srcLength<-1,
* or dest is NULL with a non-zero destCapacity.
* U_INPUT_TOO_LONG_ERROR if src is too long.
* U_INVALID_CHAR_FOUND if a non-ASCII character
* precedes the last delimiter ('-'),
* or if an invalid character (not a-zA-Z0-9) is found
* after the last delimiter.
* U_ILLEGAL_CHAR_FOUND if the delta sequence is ill-formed.
* @return Number of UChars written to dest.
*
* @see u_strToPunycode
*/
U_CAPI int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
UChar *dest, int32_t destCapacity,
UBool *caseFlags,
UErrorCode *pErrorCode);
#endif /* #if !UCONFIG_NO_IDNA */
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

2505
engine/thirdparty/icu4c/common/putil.cpp vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,615 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* FILE NAME : putilimp.h
*
* Date Name Description
* 10/17/04 grhoten Move internal functions from putil.h to this file.
******************************************************************************
*/
#ifndef PUTILIMP_H
#define PUTILIMP_H
#include "unicode/utypes.h"
#include "unicode/putil.h"
/**
* \def U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
* Nearly all CPUs and compilers implement a right-shift of a signed integer
* as an Arithmetic Shift Right which copies the sign bit (the Most Significant Bit (MSB))
* into the vacated bits (sign extension).
* For example, (int32_t)0xfff5fff3>>4 becomes 0xffff5fff and -1>>1=-1.
*
* This can be useful for storing a signed value in the upper bits
* and another bit field in the lower bits.
* The signed value can be retrieved by simple right-shifting.
*
* This is consistent with the Java language.
*
* However, the C standard allows compilers to implement a right-shift of a signed integer
* as a Logical Shift Right which copies a 0 into the vacated bits.
* For example, (int32_t)0xfff5fff3>>4 becomes 0x0fff5fff and -1>>1=0x7fffffff.
*
* Code that depends on the natural behavior should be guarded with this macro,
* with an alternate path for unusual platforms.
* @internal
*/
#ifdef U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
/* Use the predefined value. */
#else
/*
* Nearly all CPUs & compilers implement a right-shift of a signed integer
* as an Arithmetic Shift Right (with sign extension).
*/
# define U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1
#endif
/** Define this to 1 if your platform supports IEEE 754 floating point,
to 0 if it does not. */
#ifndef IEEE_754
# define IEEE_754 1
#endif
/**
* uintptr_t is an optional part of the standard definitions in stdint.h.
* The opengroup.org documentation for stdint.h says
* "On XSI-conformant systems, the intptr_t and uintptr_t types are required;
* otherwise, they are optional."
* We assume that when uintptr_t is defined, UINTPTR_MAX is defined as well.
*
* Do not use ptrdiff_t since it is signed. size_t is unsigned.
*/
/* TODO: This check fails on some z environments. Filed a ticket #9357 for this. */
#if !defined(__intptr_t_defined) && !defined(UINTPTR_MAX) && (U_PLATFORM != U_PF_OS390)
typedef size_t uintptr_t;
#endif
/*===========================================================================*/
/** @{ Information about POSIX support */
/*===========================================================================*/
#ifdef U_HAVE_NL_LANGINFO_CODESET
/* Use the predefined value. */
#elif U_PLATFORM_USES_ONLY_WIN32_API || U_PLATFORM == U_PF_ANDROID || U_PLATFORM == U_PF_QNX
# define U_HAVE_NL_LANGINFO_CODESET 0
#else
# define U_HAVE_NL_LANGINFO_CODESET 1
#endif
#ifdef U_NL_LANGINFO_CODESET
/* Use the predefined value. */
#elif !U_HAVE_NL_LANGINFO_CODESET
# define U_NL_LANGINFO_CODESET -1
#elif U_PLATFORM == U_PF_OS400
/* not defined */
#else
# define U_NL_LANGINFO_CODESET CODESET
#endif
#if defined(U_TZSET) || defined(U_HAVE_TZSET)
/* Use the predefined value. */
#elif U_PLATFORM_USES_ONLY_WIN32_API
// UWP doesn't support tzset or environment variables for tz
#if U_PLATFORM_HAS_WINUWP_API == 0
# define U_TZSET _tzset
#endif
#elif U_PLATFORM == U_PF_OS400
/* not defined */
#else
# define U_TZSET tzset
#endif
#if defined(U_TIMEZONE) || defined(U_HAVE_TIMEZONE)
/* Use the predefined value. */
#elif U_PLATFORM == U_PF_ANDROID
# define U_TIMEZONE timezone
#elif defined(__UCLIBC__)
// uClibc does not have __timezone or _timezone.
#elif defined(_NEWLIB_VERSION)
# define U_TIMEZONE _timezone
#elif defined(__GLIBC__)
// glibc
# define U_TIMEZONE __timezone
#elif U_PLATFORM_IS_LINUX_BASED
// not defined
#elif U_PLATFORM_USES_ONLY_WIN32_API
# define U_TIMEZONE _timezone
#elif U_PLATFORM == U_PF_BSD && !defined(__NetBSD__)
/* not defined */
#elif U_PLATFORM == U_PF_OS400
/* not defined */
#elif U_PLATFORM == U_PF_IPHONE
/* not defined */
#else
# define U_TIMEZONE timezone
#endif
#if defined(U_TZNAME) || defined(U_HAVE_TZNAME)
/* Use the predefined value. */
#elif U_PLATFORM_USES_ONLY_WIN32_API
/* not usable on all windows platforms */
#if U_PLATFORM_HAS_WINUWP_API == 0
# define U_TZNAME _tzname
#endif
#elif U_PLATFORM == U_PF_OS400
/* not defined */
#else
# define U_TZNAME tzname
#endif
#ifdef U_HAVE_MMAP
/* Use the predefined value. */
#elif U_PLATFORM_USES_ONLY_WIN32_API
# define U_HAVE_MMAP 0
#else
# define U_HAVE_MMAP 1
#endif
#ifdef U_HAVE_POPEN
/* Use the predefined value. */
#elif U_PLATFORM_USES_ONLY_WIN32_API
# define U_HAVE_POPEN 0
#elif U_PLATFORM == U_PF_OS400
# define U_HAVE_POPEN 0
#else
# define U_HAVE_POPEN 1
#endif
/**
* \def U_HAVE_DIRENT_H
* Defines whether dirent.h is available.
* @internal
*/
#ifdef U_HAVE_DIRENT_H
/* Use the predefined value. */
#elif U_PLATFORM_USES_ONLY_WIN32_API
# define U_HAVE_DIRENT_H 0
#else
# define U_HAVE_DIRENT_H 1
#endif
/** @} */
/*===========================================================================*/
/** @{ Programs used by ICU code */
/*===========================================================================*/
/**
* \def U_MAKE_IS_NMAKE
* Defines whether the "make" program is Windows nmake.
*/
#ifdef U_MAKE_IS_NMAKE
/* Use the predefined value. */
#elif U_PLATFORM == U_PF_WINDOWS
# define U_MAKE_IS_NMAKE 1
#else
# define U_MAKE_IS_NMAKE 0
#endif
/** @} */
/*==========================================================================*/
/* Platform utilities */
/*==========================================================================*/
/**
* Platform utilities isolate the platform dependencies of the
* library. For each platform to which this code is ported, these
* functions may have to be re-implemented.
*/
/**
* Floating point utility to determine if a double is Not a Number (NaN).
* @internal
*/
U_CAPI UBool U_EXPORT2 uprv_isNaN(double d);
/**
* Floating point utility to determine if a double has an infinite value.
* @internal
*/
U_CAPI UBool U_EXPORT2 uprv_isInfinite(double d);
/**
* Floating point utility to determine if a double has a positive infinite value.
* @internal
*/
U_CAPI UBool U_EXPORT2 uprv_isPositiveInfinity(double d);
/**
* Floating point utility to determine if a double has a negative infinite value.
* @internal
*/
U_CAPI UBool U_EXPORT2 uprv_isNegativeInfinity(double d);
/**
* Floating point utility that returns a Not a Number (NaN) value.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_getNaN(void);
/**
* Floating point utility that returns an infinite value.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_getInfinity(void);
/**
* Floating point utility to truncate a double.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_trunc(double d);
/**
* Floating point utility to calculate the floor of a double.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_floor(double d);
/**
* Floating point utility to calculate the ceiling of a double.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_ceil(double d);
/**
* Floating point utility to calculate the absolute value of a double.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_fabs(double d);
/**
* Floating point utility to calculate the fractional and integer parts of a double.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_modf(double d, double* pinteger);
/**
* Floating point utility to calculate the remainder of a double divided by another double.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_fmod(double d, double y);
/**
* Floating point utility to calculate d to the power of exponent (d^exponent).
* @internal
*/
U_CAPI double U_EXPORT2 uprv_pow(double d, double exponent);
/**
* Floating point utility to calculate 10 to the power of exponent (10^exponent).
* @internal
*/
U_CAPI double U_EXPORT2 uprv_pow10(int32_t exponent);
/**
* Floating point utility to calculate the maximum value of two doubles.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_fmax(double d, double y);
/**
* Floating point utility to calculate the minimum value of two doubles.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_fmin(double d, double y);
/**
* Private utility to calculate the maximum value of two integers.
* @internal
*/
U_CAPI int32_t U_EXPORT2 uprv_max(int32_t d, int32_t y);
/**
* Private utility to calculate the minimum value of two integers.
* @internal
*/
U_CAPI int32_t U_EXPORT2 uprv_min(int32_t d, int32_t y);
/*
 * uprv_isNegative(number): true if the most significant byte of the object
 * `number`, read as a signed char, is negative (i.e. its top bit is set).
 * That byte is the first one on big-endian platforms and the last one
 * (at offset sizeof(number)-1) otherwise.
 * `number` must be an lvalue because its address is taken.
 */
#if U_IS_BIG_ENDIAN
# define uprv_isNegative(number) (*((signed char *)&(number))<0)
#else
# define uprv_isNegative(number) (*((signed char *)&(number)+sizeof(number)-1)<0)
#endif
/**
* Return the largest positive number that can be represented by an integer
* type of arbitrary bit length.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_maxMantissa(void);
/**
* Floating point utility to calculate the logarithm of a double.
* @internal
*/
U_CAPI double U_EXPORT2 uprv_log(double d);
/**
* Performs conventional rounding, e.g. uprv_floor(x + 0.5).
* @param x the double number
* @return the rounded double
* @internal
*/
U_CAPI double U_EXPORT2 uprv_round(double x);
/**
* Adds the signed integers a and b, storing the result in res.
* Checks for signed integer overflow.
* Similar to the GCC/Clang extension __builtin_add_overflow
*
* @param a The first operand.
* @param b The second operand.
* @param res a + b
* @return true if overflow occurred; false if no overflow occurred.
* @internal
*/
U_CAPI UBool U_EXPORT2 uprv_add32_overflow(int32_t a, int32_t b, int32_t* res);
/**
* Multiplies the signed integers a and b, storing the result in res.
* Checks for signed integer overflow.
* Similar to the GCC/Clang extension __builtin_mul_overflow
*
* @param a The first multiplicand.
* @param b The second multiplicand.
* @param res a * b
* @return true if overflow occurred; false if no overflow occurred.
* @internal
*/
U_CAPI UBool U_EXPORT2 uprv_mul32_overflow(int32_t a, int32_t b, int32_t* res);
#if 0
/**
* Returns the number of digits after the decimal point in a double number x.
*
* @param x the double number
* @return the number of digits after the decimal point in a double number x.
* @internal
*/
/*U_CAPI int32_t U_EXPORT2 uprv_digitsAfterDecimal(double x);*/
#endif
#if !U_CHARSET_IS_UTF8
/**
* Please use ucnv_getDefaultName() instead.
* Return the default codepage for this platform and locale.
* This function can call setlocale() on Unix platforms. Please read the
* platform documentation on setlocale() before calling this function.
* @return the default codepage for this platform
* @internal
*/
U_CAPI const char* U_EXPORT2 uprv_getDefaultCodepage(void);
#endif
/**
* Please use uloc_getDefault() instead.
* Return the default locale ID string by querying the system, or
* zero if one cannot be found.
* This function can call setlocale() on Unix platforms. Please read the
* platform documentation on setlocale() before calling this function.
* @return the default locale ID string
* @internal
*/
U_CAPI const char* U_EXPORT2 uprv_getDefaultLocaleID(void);
/**
* Time zone utilities
*
* Wrappers for C runtime library functions relating to timezones.
* The t_tzset() function (similar to tzset) uses the current setting
* of the environment variable TZ to assign values to three global
* variables: daylight, timezone, and tzname. These variables have the
* following meanings, and are declared in &lt;time.h&gt;.
*
* daylight Nonzero if daylight-saving-time zone (DST) is specified
* in TZ; otherwise, 0. Default value is 1.
* timezone Difference in seconds between coordinated universal
* time and local time. E.g., -28,800 for PST (GMT-8hrs)
* tzname(0) Three-letter time-zone name derived from TZ environment
* variable. E.g., "PST".
* tzname(1) Three-letter DST zone name derived from TZ environment
* variable. E.g., "PDT". If DST zone is omitted from TZ,
* tzname(1) is an empty string.
*
* Notes: For example, to set the TZ environment variable to correspond
* to the current time zone in Germany, you can use one of the
* following statements:
*
* set TZ=GST1GDT
* set TZ=GST+1GDT
*
* If the TZ value is not set, t_tzset() attempts to use the time zone
* information specified by the operating system. Under Windows NT
* and Windows 95, this information is specified in the Control Panel's
* Date/Time application.
* @internal
*/
U_CAPI void U_EXPORT2 uprv_tzset(void);
/**
* Difference in seconds between coordinated universal
* time and local time. E.g., -28,800 for PST (GMT-8hrs)
* @return the difference in seconds between coordinated universal time and local time.
* @internal
*/
U_CAPI int32_t U_EXPORT2 uprv_timezone(void);
/**
* tzname(0) Three-letter time-zone name derived from TZ environment
* variable. E.g., "PST".
* tzname(1) Three-letter DST zone name derived from TZ environment
* variable. E.g., "PDT". If DST zone is omitted from TZ,
* tzname(1) is an empty string.
* @internal
*/
U_CAPI const char* U_EXPORT2 uprv_tzname(int n);
/**
* Reset the global tzname cache.
* @internal
*/
U_CAPI void uprv_tzname_clear_cache(void);
/**
* Get UTC (GMT) time measured in milliseconds since 0:00 on 1/1/1970.
* This function is affected by 'faketime' and should be the bottleneck for all user-visible ICU time functions.
* @return the UTC time measured in milliseconds
* @internal
*/
U_CAPI UDate U_EXPORT2 uprv_getUTCtime(void);
/**
* Get UTC (GMT) time measured in milliseconds since 0:00 on 1/1/1970.
* This function is not affected by 'faketime', so it should only be used by low level test functions- not by anything that
* exposes time to the end user.
* @return the UTC time measured in milliseconds
* @internal
*/
U_CAPI UDate U_EXPORT2 uprv_getRawUTCtime(void);
/**
* Determine whether a pathname is absolute or not, as defined by the platform.
* @param path Pathname to test
* @return true if the path is absolute
* @internal (ICU 3.0)
*/
U_CAPI UBool U_EXPORT2 uprv_pathIsAbsolute(const char *path);
/**
* Use U_MAX_PTR instead of this function.
* @param base pointer to test
* @return the largest possible pointer greater than the base
* @internal (ICU 3.8)
*/
U_CAPI void * U_EXPORT2 uprv_maximumPtr(void *base);
/**
* Maximum value of a (void*) - use to indicate the limit of an 'infinite' buffer.
* In fact, buffer sizes must not exceed 2GB so that the difference between
* the buffer limit and the buffer start can be expressed in an int32_t.
*
* The definition of U_MAX_PTR must fulfill the following conditions:
* - return the largest possible pointer greater than base
* - return a valid pointer according to the machine architecture (AS/400, 64-bit, etc.)
* - avoid wrapping around at high addresses
* - make sure that the returned pointer is not farther from base than 0x7fffffff bytes
*
* @param base The beginning of a buffer to find the maximum offset from
* @internal
*/
/* Only define U_MAX_PTR here if it has not already been defined. */
#ifndef U_MAX_PTR
# if U_PLATFORM == U_PF_OS390 && !defined(_LP64)
/* We have 31-bit pointers. */
# define U_MAX_PTR(base) ((void *)0x7fffffff)
# elif U_PLATFORM == U_PF_OS400
/* On this platform the computation is delegated to the function uprv_maximumPtr(). */
# define U_MAX_PTR(base) uprv_maximumPtr((void *)base)
# elif 0
/*
* Disabled variant (the "#elif 0" above never selects it); kept for reference.
*
* For platforms where pointers are scalar values (which is normal, but unlike i5/OS)
* but that do not define uintptr_t.
*
* However, this does not work on modern compilers:
* The C++ standard does not define pointer overflow, and allows compilers to
* assume that p+u>p for any pointer p and any integer u>0.
* Thus, modern compilers optimize away the ">" comparison.
* (See ICU tickets #7187 and #8096.)
*/
# define U_MAX_PTR(base) \
((void *)(((char *)(base)+0x7fffffffu) > (char *)(base) \
? ((char *)(base)+0x7fffffffu) \
: (char *)-1))
# else
/*
* Default version. C++ standard compliant for scalar pointers.
* The arithmetic is done on uintptr_t values: base+0x7fffffff is used unless
* that sum wraps around, in which case (uintptr_t)-1 is used instead.
*/
# define U_MAX_PTR(base) \
((void *)(((uintptr_t)(base)+0x7fffffffu) > (uintptr_t)(base) \
? ((uintptr_t)(base)+0x7fffffffu) \
: (uintptr_t)-1))
# endif
#endif
#ifdef __cplusplus
/**
* Pin a buffer capacity such that doing pointer arithmetic
* on the destination pointer and capacity cannot overflow.
*
* The pinned capacity must fulfill the following conditions (for positive capacities):
* - dest + capacity is a valid pointer according to the machine architecture (AS/400, 64-bit, etc.)
* - (dest + capacity) >= dest
* - The size (in bytes) of T[capacity] does not exceed 0x7fffffff
*
* @param dest the destination buffer pointer.
* @param capacity the requested buffer capacity, in units of type T.
* @return the pinned capacity.
* @internal
*/
template <typename T>
inline int32_t pinCapacity(T *dest, int32_t capacity) {
    // Zero and negative capacities are handed back untouched.
    if (capacity <= 0) {
        return capacity;
    }

    const uintptr_t start = (uintptr_t)dest;
    uintptr_t ceiling;
#   if U_PLATFORM == U_PF_OS390 && !defined(_LP64)
    // 31-bit pointers: the highest usable address is fixed.
    ceiling = 0x7fffffff;
#   elif U_PLATFORM == U_PF_OS400
    ceiling = (uintptr_t)uprv_maximumPtr((void *)dest);
#   else
    ceiling = start + 0x7fffffffu;
    if (ceiling < start) {
        // The sum wrapped around: fewer than 2GB remain before the end of the
        // address space, so stop at the highest representable address.
        ceiling = (uintptr_t)-1;
    }
#   endif

    // Number of bytes, then number of T units, available starting at dest.
    const uintptr_t roomInBytes = ceiling - start;  // max. 2GB
    const int32_t roomInUnits = (int32_t)(roomInBytes / sizeof(T));
    return roomInUnits < capacity ? roomInUnits : capacity;
}
#endif // __cplusplus
/* Dynamic Library Functions */
typedef void (UVoidFunction)(void);
#if U_ENABLE_DYLOAD
/**
* Load a library
* @internal (ICU 4.4)
*/
U_CAPI void * U_EXPORT2 uprv_dl_open(const char *libName, UErrorCode *status);
/**
* Close a library
* @internal (ICU 4.4)
*/
U_CAPI void U_EXPORT2 uprv_dl_close( void *lib, UErrorCode *status);
/**
* Extract a symbol from a library (function)
* @internal (ICU 4.8)
*/
U_CAPI UVoidFunction* U_EXPORT2 uprv_dlsym_func( void *lib, const char *symbolName, UErrorCode *status);
/**
* Extract a symbol from a library (data)
* Not implemented, no clients.
* @internal
*/
/* U_CAPI void * U_EXPORT2 uprv_dlsym_data( void *lib, const char *symbolName, UErrorCode *status); */
#endif
/**
* Define malloc and related functions:
* uprv_default_malloc, uprv_default_realloc and uprv_default_free map to the
* _C_TS_* functions on OS/400 and to the C library malloc/realloc/free elsewhere.
* @internal
*/
#if U_PLATFORM == U_PF_OS400
# define uprv_default_malloc(x) _C_TS_malloc(x)
# define uprv_default_realloc(x,y) _C_TS_realloc(x,y)
# define uprv_default_free(x) _C_TS_free(x)
/* also _C_TS_calloc(x) */
#else
/* C defaults */
# define uprv_default_malloc(x) malloc(x)
# define uprv_default_realloc(x,y) realloc(x,y)
# define uprv_default_free(x) free(x)
#endif
#endif

1303
engine/thirdparty/icu4c/common/rbbi.cpp vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,698 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// file: rbbi_cache.cpp
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/ubrk.h"
#include "unicode/rbbi.h"
#include "rbbi_cache.h"
#include "brkeng.h"
#include "cmemory.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "uassert.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/*
* DictionaryCache implementation
*/
// Builds an empty dictionary cache bound to the break iterator bi.
// status is passed on to the constructor of the fBreaks vector; all positions
// and rule status indexes start at zero and no iteration position is remembered.
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status)
    : fBI(bi),
      fBreaks(status),
      fPositionInCache(-1),
      fStart(0),
      fLimit(0),
      fFirstRuleStatusIndex(0),
      fOtherRuleStatusIndex(0) {}

// The destructor has no work of its own to do.
RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {}
// Returns the cache to its empty state: no stored boundaries, a zero-length
// text range, zeroed rule status indexes and no remembered iteration position.
void RuleBasedBreakIterator::DictionaryCache::reset() {
    fBreaks.removeAllElements();
    fFirstRuleStatusIndex = fOtherRuleStatusIndex = 0;
    fStart = fLimit = 0;
    fPositionInCache = -1;
}
// Looks up the cached boundary that follows fromPos.
//   fromPos      text position to search from; must lie in [fStart, fLimit).
//   result       receives the boundary found (written only on success).
//   statusIndex  receives fOtherRuleStatusIndex (written only on success).
// Returns true when a boundary was found, false when fromPos is outside the
// cached range or there is no later boundary in the cache.
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    const bool outsideCache = (fromPos < fStart) || (fromPos >= fLimit);
    if (outsideCache) {
        fPositionInCache = -1;
        return false;
    }

    // Fast path for sequential iteration: the remembered slot holds fromPos,
    // so the answer is simply the next slot.
    const bool onRememberedBoundary =
            fPositionInCache >= 0 &&
            fPositionInCache < fBreaks.size() &&
            fBreaks.elementAti(fPositionInCache) == fromPos;
    if (onRememberedBoundary) {
        fPositionInCache += 1;
        if (fPositionInCache >= fBreaks.size()) {
            fPositionInCache = -1;
            return false;
        }
        const int32_t nextBoundary = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(nextBoundary > fromPos);
        *result = nextBoundary;
        *statusIndex = fOtherRuleStatusIndex;
        return true;
    }

    // Random access: scan from the front for the first boundary beyond fromPos.
    fPositionInCache = 0;
    while (fPositionInCache < fBreaks.size()) {
        const int32_t candidate = fBreaks.elementAti(fPositionInCache);
        if (candidate > fromPos) {
            *result = candidate;
            *statusIndex = fOtherRuleStatusIndex;
            return true;
        }
        ++fPositionInCache;
    }
    UPRV_UNREACHABLE_EXIT;
}
// Looks up the cached boundary that precedes fromPos.
//   fromPos      text position to search from; must lie in (fStart, fLimit].
//   result       receives the boundary found (written only on success).
//   statusIndex  receives fFirstRuleStatusIndex when the boundary equals fStart,
//                fOtherRuleStatusIndex otherwise (written only on success).
// Returns true when a boundary was found, false otherwise.
UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    const bool outsideCache = (fromPos > fLimit) || (fromPos <= fStart);
    if (outsideCache) {
        fPositionInCache = -1;
        return false;
    }

    // Starting from the limit: position on the last cached entry.
    if (fromPos == fLimit) {
        fPositionInCache = fBreaks.size() - 1;
        if (fPositionInCache >= 0) {
            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
        }
    }

    // Fast path for sequential iteration: step back one slot from the remembered one.
    const bool onRememberedBoundary =
            fPositionInCache > 0 &&
            fPositionInCache < fBreaks.size() &&
            fBreaks.elementAti(fPositionInCache) == fromPos;
    if (onRememberedBoundary) {
        fPositionInCache -= 1;
        const int32_t prevBoundary = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(prevBoundary < fromPos);
        *result = prevBoundary;
        if (prevBoundary == fStart) {
            *statusIndex = fFirstRuleStatusIndex;
        } else {
            *statusIndex = fOtherRuleStatusIndex;
        }
        return true;
    }

    // Remembered slot is the very first one: nothing precedes it.
    if (fPositionInCache == 0) {
        fPositionInCache = -1;
        return false;
    }

    // Random access: scan from the back for the last boundary before fromPos.
    fPositionInCache = fBreaks.size() - 1;
    while (fPositionInCache >= 0) {
        const int32_t candidate = fBreaks.elementAti(fPositionInCache);
        if (candidate < fromPos) {
            *result = candidate;
            if (candidate == fStart) {
                *statusIndex = fFirstRuleStatusIndex;
            } else {
                *statusIndex = fOtherRuleStatusIndex;
            }
            return true;
        }
        --fPositionInCache;
    }
    UPRV_UNREACHABLE_EXIT;
}
// Fills the dictionary cache with the boundaries that the language break engines
// find in the text range starting at startPos and ending at endPos.
// firstRuleStatus and otherRuleStatus are stored in fFirstRuleStatusIndex and
// fOtherRuleStatusIndex.
// A range of length <= 1 is ignored entirely: the function returns before reset(),
// so the previous cache contents are left in place.
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
int32_t firstRuleStatus, int32_t otherRuleStatus) {
if ((endPos - startPos) <= 1) {
return;
}
reset();
fFirstRuleStatusIndex = firstRuleStatus;
fOtherRuleStatusIndex = otherRuleStatus;
int32_t rangeStart = startPos;
int32_t rangeEnd = endPos;
uint16_t category;
int32_t current;
// status is local to this function: a failure ends the loop below,
// but is not reported to the caller (the function returns void).
UErrorCode status = U_ZERO_ERROR;
int32_t foundBreakCount = 0;
UText *text = &fBI->fText;
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
// any breaks within the span.
utext_setNativeIndex(text, rangeStart);
UChar32 c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
// Categories below dictStart are skipped by the inner loop below.
uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
while(U_SUCCESS(status)) {
// Advance until the end of the range or until a character whose category is >= dictStart.
// Note that "current" is assigned inside the loop condition.
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
&& (category < dictStart)) {
utext_next32(text); // TODO: cleaner loop structure.
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
}
if (current >= rangeEnd) {
break;
}
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
// Reload the loop variables for the next go-round
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
}
// If we found breaks, ensure that the first and last entries are
// the original starting and ending position. And initialize the
// cache iteration position to the first entry.
// printf("foundBreakCount = %d\n", foundBreakCount);
if (foundBreakCount > 0) {
U_ASSERT(foundBreakCount == fBreaks.size());
if (startPos < fBreaks.elementAti(0)) {
// The dictionary did not place a boundary at the start of the segment of text.
// Add one now. This should not commonly happen, but it would be easy for interactions
// of the rules for dictionary segments and the break engine implementations to
// inadvertently cause it. Cover it here, just in case.
fBreaks.insertElementAt(startPos, 0, status);
}
if (endPos > fBreaks.peeki()) {
fBreaks.push(endPos, status);
}
fPositionInCache = 0;
// Note: Dictionary matching may extend beyond the original limit.
fStart = fBreaks.elementAti(0);
fLimit = fBreaks.peeki();
} else {
// there were no language-based breaks, even though the segment contained
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
// for this range will fail, and the calling code will fall back to the rule based boundaries.
}
}
/*
* BreakCache implementation
*/
// Builds a break cache bound to the break iterator bi. status is passed on to
// the constructor of fSideBuffer. The circular buffer is then initialized by
// reset() called with its default arguments.
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status)
    : fBI(bi),
      fSideBuffer(status) {
    this->reset();
}

// The destructor has no work of its own to do.
RuleBasedBreakIterator::BreakCache::~BreakCache() {}
// Re-initializes the circular buffer so that it holds exactly one boundary,
// pos, with rule status ruleStatus, stored in slot 0. The start, end and
// current buffer indexes all point at that slot.
void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
    fBoundaries[0] = pos;
    fStatuses[0] = static_cast<uint16_t>(ruleStatus);
    fStartBufIdx = fEndBufIdx = fBufIdx = 0;
    fTextIdx = pos;
}
// Publishes the cache's current position to the owning break iterator
// (position, rule status index, and a cleared "done" flag) and returns
// that position.
int32_t RuleBasedBreakIterator::BreakCache::current() {
    const int32_t here = fTextIdx;
    fBI->fDone = false;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    fBI->fPosition = here;
    return here;
}
// Moves the iteration to the boundary following startPos.
// Does nothing when status already indicates a failure, or when startPos could
// not be located in (or brought into) the cache.
void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    // Try, in order: already there, found by seek(), brought in by populateNear().
    const bool positioned =
            (startPos == fTextIdx) || seek(startPos) || populateNear(startPos, status);
    if (!positioned) {
        return;
    }

    // startPos is in the cache; the answer is one next() away.
    // TODO: the handling of bi->fDone is awkward.
    //   - seek() does not clear it, and cannot because of how it interacts with populateNear().
    //   - next() does not clear it on its fast path, where every instruction counts. Maybe it should.
    // So it is cleared here, covering a successful seek() on an iterator that had
    // previously run off the end.
    fBI->fDone = false;
    next();
}
// Moves the iteration to the boundary preceding startPos.
// Does nothing when status already indicates a failure, or when startPos could
// not be located in (or brought into) the cache.
void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (startPos != fTextIdx && !seek(startPos) && !populateNear(startPos, status)) {
        return;
    }

    if (startPos == fTextIdx) {
        // startPos is itself a boundary: step back from it.
        previous(status);
        return;
    }

    // startPos lies between two boundaries; seek() has left the cache on the
    // one before it. current() pushes that cache position out to the
    // BreakIterator itself.
    U_ASSERT(startPos > fTextIdx);
    current();
}
/*
* Out-of-line code for BreakCache::next().
* Cache does not already contain the boundary
*/
void RuleBasedBreakIterator::BreakCache::nextOL() {
    // Extend the cache first; the iterator is "done" when nothing could be added.
    const bool advanced = populateFollowing();
    fBI->fDone = !advanced;
    // Publish the (possibly updated) cache position to the break iterator.
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    fBI->fPosition = fTextIdx;
}
// Steps the iteration back by one boundary and publishes the result to the
// break iterator. The iterator's fDone flag is set when the buffer index did
// not change. Does nothing when status already indicates a failure.
void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    const int32_t bufIdxOnEntry = fBufIdx;
    if (fBufIdx != fStartBufIdx) {
        // The preceding boundary is already cached: just step back one slot.
        fBufIdx = modChunkSize(fBufIdx - 1);
        fTextIdx = fBoundaries[fBufIdx];
    } else {
        // Sitting on the first cached boundary: prepend to the cache.
        populatePreceding(status);
    }

    fBI->fDone = (fBufIdx == bufIdxOnEntry);
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    fBI->fPosition = fTextIdx;
}
// Positions the cache at pos, if pos lies within the cached range.
// When pos is a cached boundary the cache is left on it; when pos falls
// between two cached boundaries the cache is left on the preceding one.
// Returns false, leaving the cache position untouched, when pos is before the
// first or after the last cached boundary.
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
    const int32_t firstBoundary = fBoundaries[fStartBufIdx];
    const int32_t lastBoundary = fBoundaries[fEndBufIdx];
    if (pos < firstBoundary || pos > lastBoundary) {
        return false;
    }

    if (pos == firstBoundary) {
        // Common case: seek(0), from BreakIterator::first()
        fBufIdx = fStartBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }
    if (pos == lastBoundary) {
        fBufIdx = fEndBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }

    // Binary search over the circular buffer for the first boundary > pos.
    // When the live region wraps (lo > hi), CACHE_SIZE is added before halving
    // and the midpoint is folded back into range with modChunkSize().
    int32_t lo = fStartBufIdx;
    int32_t hi = fEndBufIdx;
    while (lo != hi) {
        const int32_t wrapAdjust = (lo > hi) ? CACHE_SIZE : 0;
        const int32_t mid = modChunkSize((lo + hi + wrapAdjust) / 2);
        if (fBoundaries[mid] > pos) {
            hi = mid;
        } else {
            lo = modChunkSize(mid + 1);
        }
    }
    U_ASSERT(fBoundaries[hi] > pos);

    // The slot just before it holds the boundary at or preceding pos.
    fBufIdx = modChunkSize(hi - 1);
    fTextIdx = fBoundaries[fBufIdx];
    U_ASSERT(fTextIdx <= pos);
    return true;
}
// Adds boundaries to the cache near the specified position.
//
//   position  a text position that is currently outside the cached range.
//             It need not be a boundary itself, but must be within the range
//             of the text and on a code point boundary.
//   status    ICU error code; the function does nothing and returns false if
//             it already indicates a failure.
//
// On success (return value true):
//   - if position is a break boundary, the iteration position is left on it;
//   - otherwise the iteration position is left on the preceding boundary, and
//     both the preceding and the following boundaries are in the cache.
// Additional boundaries, either preceding or following, may be added to the
// cache as a side effect. If the requested position is not near already cached
// positions, the existing cache is cleared, a near-by boundary is found and
// new cache contents begin there.
//
// Returns false if the cache could not be extended to cover position, which
// happens when populatePreceding() cannot make progress or status turns into a
// failure while filling. Without these checks the fill loops below would spin
// forever, because a failed populatePreceding() leaves the cache unchanged.
UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

    // Threshold for a text position to be considered near to existing cache contents.
    // TODO: See issue ICU-22024 "perf tuning of Cache needed."
    //       This value is subject to change. See the ticket for more details.
    static constexpr int32_t CACHE_NEAR = 15;

    int32_t aBoundary = -1;
    int32_t ruleStatusIndex = 0;
    bool retainCache = false;
    if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
        // Requested position is near the existing cache. Retain it.
        retainCache = true;
    } else if (position <= CACHE_NEAR) {
        // Requested position is near the start of the text. Fill cache from start, skipping
        // the need to find a safe point.
        retainCache = false;
        aBoundary = 0;
    } else {
        // Requested position is not near the existing cache.
        // Find a safe point to refill the cache from.
        int32_t backupPos = fBI->handleSafePrevious(position);

        if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
            // The requested position is beyond the end of the existing cache, but the
            // reverse rules produced a position near or before the cached region.
            // Retain the existing cache, and fill from the end of it.
            retainCache = true;
        } else if (backupPos < CACHE_NEAR) {
            // The safe reverse rules moved us to near the start of text.
            // Take that (index 0) as the backup boundary, avoiding the complication
            // (in the following block) of moving forward from the safe point to a known boundary.
            //
            // Retain the cache if it begins not too far from the requested position.
            aBoundary = 0;
            retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
        } else {
            // The safe reverse rules produced a position that is neither near the existing
            // cache, nor near the start of text.
            // Advance to the boundary following.
            // There is a complication: the safe reverse rules identify pairs of code points
            // that are safe. If advancing from the safe point moves forwards by less than
            // two code points, we need to advance one more time to ensure that the boundary
            // is good, including a correct rules status value.
            retainCache = false;
            fBI->fPosition = backupPos;
            aBoundary = fBI->handleNext();
            if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
                // +4 is a quick test for possibly having advanced only one codepoint.
                // Four being the length of the longest potential code point, a supplementary in UTF-8
                utext_setNativeIndex(&fBI->fText, aBoundary);
                if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    aBoundary = fBI->handleNext();      // Safe rules identify safe pairs.
                }
            }
            if (aBoundary == UBRK_DONE) {
                // Note (Andy Heninger): I don't think this condition can occur, but it's hard
                // to prove that it can't. We ran off the end of the string looking a boundary
                // following a safe point; choose the end of the string as that boundary.
                // The native length is narrowed to the 32-bit position type used by the cache.
                aBoundary = static_cast<int32_t>(utext_nativeLength(&fBI->fText));
            }
            ruleStatusIndex = fBI->fRuleStatusIndex;
        }
    }

    if (!retainCache) {
        U_ASSERT(aBoundary != -1);
        reset(aBoundary, ruleStatusIndex);   // Reset cache to hold aBoundary as a single starting point.
    }

    // Fill in boundaries between existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
        // The last position in the cache precedes the requested position.
        // Add following position(s) to the cache.
        while (fBoundaries[fEndBufIdx] < position) {
            if (!populateFollowing()) {
                UPRV_UNREACHABLE_EXIT;
            }
        }
        fBufIdx = fEndBufIdx;                // Set iterator position to the end of the buffer.
        fTextIdx = fBoundaries[fBufIdx];     // Required because populateFollowing may add extra boundaries.
        while (fTextIdx > position) {        // Move backwards to a position at or preceding the requested pos.
            previous(status);
            if (U_FAILURE(status)) {
                // previous() is a no-op once status has failed; bail out
                // instead of looping without making progress.
                return false;
            }
        }
        return true;
    }

    if (fBoundaries[fStartBufIdx] > position) {
        // The first position in the cache is beyond the requested position.
        // back up more until we get a boundary <= the requested position.
        while (fBoundaries[fStartBufIdx] > position) {
            if (!populatePreceding(status)) {
                // Nothing was prepended (error, or no earlier boundary available).
                // A retry would behave identically, so give up rather than spin.
                return false;
            }
        }
        fBufIdx = fStartBufIdx;              // Set iterator position to the start of the buffer.
        fTextIdx = fBoundaries[fBufIdx];     // Required because populatePreceding may add extra boundaries.
        while (fTextIdx < position) {        // Move forwards to a position at or following the requested pos.
            next();
        }
        if (fTextIdx > position) {
            // If position is not itself a boundary, the next() loop above will overshoot.
            // Back up one, leaving cache position at the boundary preceding the requested position.
            previous(status);
        }
        return true;
    }

    U_ASSERT(fTextIdx == position);
    return true;
}
// Appends at least one boundary after the last cached one and moves the cache
// position onto the first boundary appended. Returns false, adding nothing,
// when the rules report UBRK_DONE; returns true otherwise.
UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
    const int32_t lastCachedPos = fBoundaries[fEndBufIdx];
    const int32_t lastCachedStatus = fStatuses[fEndBufIdx];
    int32_t boundary = 0;
    int32_t boundaryStatus = 0;

    // First choice: the dictionary cache may already know the answer.
    if (fBI->fDictionaryCache->following(lastCachedPos, &boundary, &boundaryStatus)) {
        addFollowing(boundary, boundaryStatus, UpdateCachePosition);
        return true;
    }

    // Otherwise ask the rules for the next boundary.
    fBI->fPosition = lastCachedPos;
    boundary = fBI->handleNext();
    if (boundary == UBRK_DONE) {
        return false;
    }
    boundaryStatus = fBI->fRuleStatusIndex;

    if (fBI->fDictionaryCharCount > 0) {
        // The rule-based segment contains dictionary characters. Subdivide it;
        // the pieces land in the dictionary cache, and the first one (if any)
        // is what gets appended here.
        fBI->fDictionaryCache->populateDictionary(lastCachedPos, boundary, lastCachedStatus, boundaryStatus);
        if (fBI->fDictionaryCache->following(lastCachedPos, &boundary, &boundaryStatus)) {
            addFollowing(boundary, boundaryStatus, UpdateCachePosition);
            return true;
            // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
            //       But be careful with interactions with populateNear().
        }
    }

    // Either the segment had no dictionary characters, or it had some but the
    // dictionary segmenter did not handle them (so the return above was not
    // taken). In both cases the end of the rule-based segment is the boundary.
    addFollowing(boundary, boundaryStatus, UpdateCachePosition);

    // Speculatively cache a few more non-dictionary boundaries so that
    // subsequent calls to BreakIterator::next() take the fast path and get
    // cached results. The cache position is not moved by these additions.
    int remaining = 6;
    while (remaining-- > 0) {
        boundary = fBI->handleNext();
        if (boundary == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
            break;
        }
        addFollowing(boundary, fBI->fRuleStatusIndex, RetainCachePosition);
    }
    return true;
}
// Prepends at least one boundary before the first cached one and moves the
// cache position onto the boundary immediately preceding the old first one.
// Returns false, adding nothing, when status already indicates a failure, when
// the first cached boundary is 0, or when the side buffer ends up empty;
// returns true otherwise.
UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
if (U_FAILURE(status)) {
return false;
}
int32_t fromPosition = fBoundaries[fStartBufIdx];
if (fromPosition == 0) {
return false;
}
int32_t position = 0;
int32_t positionStatusIdx = 0;
// First choice: the dictionary cache may already hold the preceding boundary.
if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
addPreceding(position, positionStatusIdx, UpdateCachePosition);
return true;
}
int32_t backupPosition = fromPosition;
// Find a boundary somewhere preceding the first already-cached boundary
do {
// Step back 30 units at a time (clamped at 0) until the boundary found is before fromPosition.
backupPosition = backupPosition - 30;
if (backupPosition <= 0) {
backupPosition = 0;
} else {
backupPosition = fBI->handleSafePrevious(backupPosition);
}
if (backupPosition == UBRK_DONE || backupPosition == 0) {
position = 0;
positionStatusIdx = 0;
} else {
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fBI->fPosition = backupPosition;
position = fBI->handleNext();
if (position <= backupPosition + 4) {
// +4 is a quick test for possibly having advanced only one codepoint.
// Four being the length of the longest potential code point, a supplementary in UTF-8
utext_setNativeIndex(&fBI->fText, position);
if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
// The initial handleNext() only advanced by a single code point. Go again.
position = fBI->handleNext(); // Safe rules identify safe pairs.
}
}
positionStatusIdx = fBI->fRuleStatusIndex;
}
} while (position >= fromPosition);
// Find boundaries between the one we just located and the first already-cached boundary
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
// The side buffer holds (position, status index) pairs, pushed in that order.
fSideBuffer.removeAllElements();
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
do {
int32_t prevPosition = fBI->fPosition = position;
int32_t prevStatusIdx = positionStatusIdx;
position = fBI->handleNext();
positionStatusIdx = fBI->fRuleStatusIndex;
if (position == UBRK_DONE) {
break;
}
UBool segmentHandledByDictionary = false;
if (fBI->fDictionaryCharCount != 0) {
// Segment from the rules includes dictionary characters.
// Subdivide it, with subdivided results going into the dictionary cache.
int32_t dictSegEndPosition = position;
fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
// Copy the dictionary boundaries that lie before fromPosition into the side buffer.
while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
segmentHandledByDictionary = true;
U_ASSERT(position > prevPosition);
if (position >= fromPosition) {
break;
}
U_ASSERT(position <= dictSegEndPosition);
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
prevPosition = position;
}
U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
}
if (!segmentHandledByDictionary && position < fromPosition) {
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
}
} while (position < fromPosition);
// Move boundaries from the side buffer to the main circular buffer.
// Pairs are popped in reverse order of insertion: status index first, then position.
UBool success = false;
if (!fSideBuffer.isEmpty()) {
// The last pair pushed becomes the new cache position.
positionStatusIdx = fSideBuffer.popi();
position = fSideBuffer.popi();
addPreceding(position, positionStatusIdx, UpdateCachePosition);
success = true;
}
while (!fSideBuffer.isEmpty()) {
positionStatusIdx = fSideBuffer.popi();
position = fSideBuffer.popi();
if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
// No space in circular buffer to hold a new preceding result while
// also retaining the current cache (iteration) position.
// Bailing out is safe; the cache will refill again if needed.
break;
}
}
return success;
}
// Appends one boundary after the current end of the circular buffer.
//   position       the boundary; asserted to be greater than the current last one.
//   ruleStatusIdx  its rule status index; asserted to fit in 16 bits.
//   update         UpdateCachePosition moves the cache position onto the new
//                  entry; otherwise the cache position is left where it was.
// When the buffer is full, the start index is advanced by 6 slots to make room.
void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    U_ASSERT(position > fBoundaries[fEndBufIdx]);
    U_ASSERT(ruleStatusIdx <= UINT16_MAX);

    const int32_t slot = modChunkSize(fEndBufIdx + 1);
    if (slot == fStartBufIdx) {
        fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
    }

    fEndBufIdx = slot;
    fStatuses[slot] = static_cast<uint16_t>(ruleStatusIdx);
    fBoundaries[slot] = position;

    if (update != UpdateCachePosition) {
        // The original cache position is being retained. Callers are
        // responsible for not adding so many entries that the buffer wraps
        // around and overwrites that position; check that they did not.
        U_ASSERT(slot != fBufIdx);
        return;
    }

    // Make the newly added boundary the current position.
    fTextIdx = position;
    fBufIdx = slot;
}
// Prepends one boundary before the current start of the circular buffer.
//   position       the boundary; asserted to be less than the current first one.
//   ruleStatusIdx  its rule status index; asserted to fit in 16 bits.
//   update         UpdateCachePosition moves the cache position onto the new
//                  entry; RetainCachePosition leaves it where it was.
// Returns false, adding nothing, when the buffer is full, the cache position
// sits on the last entry, and that position has to be retained. Returns true
// otherwise; a full buffer then gives up its last entry to make room.
bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
    U_ASSERT(position < fBoundaries[fStartBufIdx]);
    U_ASSERT(ruleStatusIdx <= UINT16_MAX);

    const int32_t slot = modChunkSize(fStartBufIdx - 1);
    if (slot == fEndBufIdx) {
        // The buffer is full. The new entry would take over the slot of the
        // last one, which must not happen if that slot is the iteration
        // position and the caller wants the iteration position kept.
        const bool wouldEvictCurrent = (fBufIdx == fEndBufIdx) && (update == RetainCachePosition);
        if (wouldEvictCurrent) {
            return false;
        }
        fEndBufIdx = modChunkSize(fEndBufIdx - 1);
    }

    fStartBufIdx = slot;
    fStatuses[slot] = static_cast<uint16_t>(ruleStatusIdx);
    fBoundaries[slot] = position;

    if (update == UpdateCachePosition) {
        fTextIdx = position;
        fBufIdx = slot;
    }
    return true;
}
// Debugging aid: prints the current text and buffer indexes, then every cached
// slot from the start index through the end index. Compiles to an empty
// function unless RBBI_DEBUG is defined.
void RuleBasedBreakIterator::BreakCache::dumpCache() {
#ifdef RBBI_DEBUG
    RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
    int32_t slot = fStartBufIdx;
    while (true) {
        RBBIDebugPrintf("%d %d\n", slot, fBoundaries[slot]);
        if (slot == fEndBufIdx) {
            break;
        }
        slot = modChunkSize(slot + 1);
    }
#endif
}
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_BREAK_ITERATION

View file

@ -0,0 +1,203 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// file: rbbi_cache.h
//
#ifndef RBBI_CACHE_H
#define RBBI_CACHE_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/rbbi.h"
#include "unicode/uobject.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
* Dictionary boundaries are moved first to this cache, then from here
* to the main BreakCache, where they may inter-leave with non-dictionary
* boundaries. The public BreakIterator API always fetches directly
* from the main BreakCache, not from here.
*
* In common situations, the number of boundaries in a single dictionary run
* should be quite small, it will be terminated by punctuation, spaces,
* or any other non-dictionary characters. The main BreakCache may end
* up with boundaries from multiple dictionary based runs.
*
* The boundaries are stored in a simple ArrayList (vector), with the
* assumption that they will be accessed sequentially.
*/
class RuleBasedBreakIterator::DictionaryCache: public UMemory {
public:
// bi is the break iterator this cache belongs to (kept in fBI, see below);
// status is the usual ICU in/out error code.
DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
~DictionaryCache();
// NOTE(review): presumably empties the cache; the implementation is in rbbi_cache.cpp.
void reset();
// Look up the cached boundary after / before fromPos.
// NOTE(review): presumably the boundary and its rule status index are returned
// through *pos and *statusIndex, and the UBool result reports whether the cache
// held a suitable boundary -- confirm against rbbi_cache.cpp.
UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
/**
* Populate the cache with the dictionary based boundaries within a region of text.
* @param startPos The start position of a range of text
* @param endPos The end position of a range of text
* @param firstRuleStatus The rule status index that applies to the break at startPos
* @param otherRuleStatus The rule status index that applies to boundaries other than startPos
* @internal
*/
void populateDictionary(int32_t startPos, int32_t endPos,
int32_t firstRuleStatus, int32_t otherRuleStatus);
// The data members are public; they are declared in the order below and that
// order should not be changed casually (it fixes layout and initialization order).
RuleBasedBreakIterator *fBI;
UVector32 fBreaks; // A vector containing the boundaries.
int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following()
// or preceding(). Optimizes sequential access.
int32_t fStart; // Text position of first boundary in cache.
int32_t fLimit; // Last boundary in cache. Which is the limit of the
// text segment being handled by the dictionary.
int32_t fFirstRuleStatusIndex; // Rule status info for first boundary.
int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
};
/*
* class BreakCache
*
* Cache of break boundary positions and rule status values.
* Break iterator API functions, next(), previous(), etc., will use cached results
* when possible, and otherwise cache new results as they are obtained.
*
* Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
*
* The cache is implemented as a single circular buffer.
*/
/*
* The size of the circular cache buffer is BreakCache::CACHE_SIZE, declared in the class below.
*/
class RuleBasedBreakIterator::BreakCache: public UMemory {
public:
BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
virtual ~BreakCache();
void reset(int32_t pos = 0, int32_t ruleStatus = 0);
// Advance to the next boundary.
// Fast path (inline): when the current buffer slot is not the last cached one,
// step to the next slot (with wrap-around) and copy its boundary and rule status
// into the owning iterator (fBI->fPosition, fBI->fRuleStatusIndex).
// Otherwise defer to the out-of-line nextOL().
void next() { if (fBufIdx == fEndBufIdx) {
nextOL();
} else {
fBufIdx = modChunkSize(fBufIdx + 1);
fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
}
}
// Out-of-line part of next(), called when the iteration position is already on
// the last cached boundary.
void nextOL();
void previous(UErrorCode &status);
// Move the iteration state to the position following the startPosition.
// Input position must be pinned to the input length.
void following(int32_t startPosition, UErrorCode &status);
void preceding(int32_t startPosition, UErrorCode &status);
/*
* Update the state of the public BreakIterator (fBI) to reflect the
* current state of the break iterator cache (this).
*/
int32_t current();
/**
* Add boundaries to the cache near the specified position.
* The given position need not be a boundary itself.
* The input position must be within the range of the text, and
* on a code point boundary.
* If the requested position is a break boundary, leave the iteration
* position on it.
* If the requested position is not a boundary, leave the iteration
* position on the preceding boundary and include both the
* preceding and following boundaries in the cache.
* Additional boundaries, either preceding or following, may be added
* to the cache as a side effect.
*
* Return false if the operation failed.
*/
UBool populateNear(int32_t position, UErrorCode &status);
/**
* Add boundary(s) to the cache following the current last boundary.
* Return false if at the end of the text, and no more boundaries can be added.
* Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
*/
UBool populateFollowing();
/**
* Add one or more boundaries to the cache preceding the first currently cached boundary.
* Leave the iteration position on the first added boundary.
* Return false if no boundaries could be added (if at the start of the text.)
*/
UBool populatePreceding(UErrorCode &status);
// Selects whether addFollowing() / addPreceding() also move the iteration
// position to the boundary they add.
enum UpdatePositionValues {
RetainCachePosition = 0,
UpdateCachePosition = 1
};
/*
* Add the boundary following the current position.
* The current position can be left as it was, or changed to the newly added boundary,
* as specified by the update parameter.
*/
void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
/*
* Add the boundary preceding the current position.
* The current position can be left as it was, or changed to the newly added boundary,
* as specified by the update parameter.
*/
bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
/**
* Set the cache position to the specified position, or, if the position
* falls between two cached boundaries, to the preceding boundary.
* Fails if the requested position is outside of the range of boundaries currently held by the cache.
* The startPosition must be on a code point boundary.
*
* Return true if successful, false if the specified position is after
* the last cached boundary or before the first.
*/
UBool seek(int32_t startPosition);
// Debug dump of the cache contents; does nothing unless RBBI_DEBUG is defined.
void dumpCache();
private:
// Wrap a buffer index into [0, CACHE_SIZE) by masking. This relies on CACHE_SIZE
// being a power of two (enforced by the static_assert below).
static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }
static constexpr int32_t CACHE_SIZE = 128;
static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
RuleBasedBreakIterator *fBI; // The break iterator whose fPosition / fRuleStatusIndex this cache updates.
int32_t fStartBufIdx; // Buffer index of the first cached boundary.
int32_t fEndBufIdx; // inclusive
int32_t fTextIdx; // Text position of the current iteration position (fBoundaries[fBufIdx]).
int32_t fBufIdx; // Buffer index of the current iteration position.
int32_t fBoundaries[CACHE_SIZE]; // Circular buffer of boundary text positions.
uint16_t fStatuses[CACHE_SIZE]; // Rule status index for each entry of fBoundaries.
UVector32 fSideBuffer; // NOTE(review): scratch vector; its use is not visible in this header.
};
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_BREAK_ITERATION
#endif // RBBI_CACHE_H

View file

@ -0,0 +1,476 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
***************************************************************************
* Copyright (C) 1999-2014 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/ucptrie.h"
#include "unicode/utypes.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "udatamem.h"
#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
//-----------------------------------------------------------------------------
//
// Constructors.
//
//-----------------------------------------------------------------------------
// Wrap already-loaded RBBI data.
// init0() resets every field; init() then validates the header and sets up the
// table pointers. Once init() has accepted the header it clears fDontFreeData,
// so the destructor will uprv_free() the header: the wrapper takes ownership.
RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
init0();
init(data, status);
}
// Wrap RBBI data without taking ownership of it (the unnamed EDontAdopt argument
// only selects this overload).
RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
init0();
init(data, status);
// Must come after init(), which clears this flag on success. With the flag set
// the destructor never frees the header.
fDontFreeData = true;
}
// Wrap RBBI data held in a UDataMemory.
// The generic ICU data header is validated first (size, endianness, charset
// family, "Brk " format tag, format version); the RBBI-specific header that
// follows it is then handed to init(). On an early failure return fUDataMem
// stays null, so the destructor will not close udm.
RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
    init0();
    if (U_FAILURE(status)) {
        return;
    }

    const DataHeader *icuHeader = udm->pHeader;
    const int32_t icuHeaderSize = icuHeader->dataHeader.headerSize;

    // Reject the data as soon as any one of the header requirements is not met.
    if (icuHeaderSize < 20 ||
        icuHeader->info.isBigEndian != U_IS_BIG_ENDIAN ||
        icuHeader->info.charsetFamily != U_CHARSET_FAMILY ||
        icuHeader->info.dataFormat[0] != 0x42 ||   // dataFormat="Brk "
        icuHeader->info.dataFormat[1] != 0x72 ||
        icuHeader->info.dataFormat[2] != 0x6b ||
        icuHeader->info.dataFormat[3] != 0x20 ||
        !isDataVersionAcceptable(icuHeader->info.formatVersion)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }

    // The RBBI data header starts immediately after the ICU data header.
    const RBBIDataHeader *rbbiHeader = reinterpret_cast<const RBBIDataHeader *>(
            reinterpret_cast<const char *>(icuHeader) + icuHeaderSize);
    init(rbbiHeader, status);
    fUDataMem = udm;
}
// A data version is acceptable when its first (major) byte matches the major
// byte of RBBI_DATA_FORMAT_VERSION; the remaining bytes are not compared.
UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
    return version[0] == RBBI_DATA_FORMAT_VERSION[0];
}
//-----------------------------------------------------------------------------
//
// init(). Does most of the work of construction, shared between the
// constructors.
//
//-----------------------------------------------------------------------------
// Put every field into a known "empty" state. Called first by each constructor
// so that the destructor is safe even when init() bails out early.
void RBBIDataWrapper::init0() {
    // Ownership / lifetime bookkeeping. Nothing is owned yet.
    fRefCount = 0;
    fDontFreeData = true;
    fUDataMem = nullptr;

    // Pointers into the data image, filled in by init().
    fHeader = nullptr;
    fForwardTable = nullptr;
    fReverseTable = nullptr;
    fRuleSource = nullptr;
    fRuleStatusTable = nullptr;
    fTrie = nullptr;
}
// Validate the RBBI data header and set up the pointers into the data image.
//   data    start of the RBBI-specific header (not the generic ICU data header).
//   status  in/out ICU error code; nothing is done if it already indicates failure.
//           Set to U_INVALID_FORMAT_ERROR for a bad magic number, an unacceptable
//           format version, or a trie whose value width is neither 8 nor 16 bits.
// The reference count becomes 1 only when the whole function succeeds.
void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
// fHeader is recorded before validation, so it is set even when the checks below fail.
fHeader = data;
if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) {
status = U_INVALID_FORMAT_ERROR;
return;
}
// Note: in ICU version 3.2 and earlier, there was a formatVersion 1
// that is no longer supported. At that time fFormatVersion was
// an int32_t field, rather than an array of 4 bytes.
// From here on the destructor will uprv_free() the header unless a constructor
// sets fDontFreeData back to true or supplies fUDataMem.
fDontFreeData = false;
// A table pointer stays null when the corresponding table length is zero.
if (data->fFTableLen != 0) {
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
}
if (data->fRTableLen != 0) {
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
}
// Open the code point trie directly on the bytes inside the data image.
fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST,
UCPTRIE_VALUE_BITS_ANY,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
nullptr, // *actual length
&status);
if (U_FAILURE(status)) {
return;
}
// Only 8 bit and 16 bit trie values are accepted.
UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie);
if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) {
status = U_INVALID_FORMAT_ERROR;
return;
}
// The rule source is stored as UTF-8; fRuleString is a UnicodeString built from it.
fRuleSource = ((char *)data + fHeader->fRuleSource);
fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen));
U_ASSERT(data->fRuleSourceLen > 0);
fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
// fStatusTableLen is divided by the element size, i.e. this is a count of int32_t entries.
fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t);
fRefCount = 1;
#ifdef RBBI_DEBUG
// Dump the tables when the U_RBBIDEBUG environment variable contains "data".
char *debugEnv = getenv("U_RBBIDEBUG");
if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
#endif
}
//-----------------------------------------------------------------------------
//
// Destructor. Don't call this - use removeReference() instead.
//
//-----------------------------------------------------------------------------
// Release the trie and then the underlying data image. How the image is
// released depends on how the wrapper was constructed:
//   - from a UDataMemory: close it;
//   - from a raw header that was adopted: uprv_free() it;
//   - otherwise (fDontFreeData set): leave it alone.
RBBIDataWrapper::~RBBIDataWrapper() {
    U_ASSERT(fRefCount == 0);

    ucptrie_close(fTrie);
    fTrie = nullptr;

    if (fUDataMem != nullptr) {
        udata_close(fUDataMem);
        return;
    }
    if (fDontFreeData) {
        return;
    }
    uprv_free((void *)fHeader);
}
//-----------------------------------------------------------------------------
//
// Operator == Consider two RBBIDataWrappers to be equal if they
// refer to the same underlying data. Although
// the data wrappers are normally shared between
// iterator instances, it's possible to independently
// open the same data twice, and get two instances, which
// should still be ==.
//
//-----------------------------------------------------------------------------
// Two wrappers are equal when they point at the same header, or when their
// data images have the same length and identical bytes.
bool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
    const RBBIDataHeader *mine = fHeader;
    const RBBIDataHeader *theirs = other.fHeader;
    if (mine == theirs) {
        // Same underlying data; no need to compare contents.
        return true;
    }
    // Compare lengths first so the byte comparison never runs on images of different size.
    return mine->fLength == theirs->fLength &&
           uprv_memcmp(mine, theirs, mine->fLength) == 0;
}
// Cheap hash: the length of the forward state table as recorded in the header.
// Wrappers that compare equal have identical header bytes, hence the same hash.
int32_t RBBIDataWrapper::hashCode() {
    const RBBIDataHeader *header = fHeader;
    return header->fFTableLen;
}
//-----------------------------------------------------------------------------
//
// Reference Counting. A single RBBIDataWrapper object is shared among
// however many RuleBasedBreakIterator instances are
// referencing the same data.
//
//-----------------------------------------------------------------------------
// Drop one reference. The wrapper deletes itself when the count reaches zero,
// so the caller must not touch the object after this call.
void RBBIDataWrapper::removeReference() {
    const bool stillReferenced = umtx_atomic_dec(&fRefCount) != 0;
    if (stillReferenced) {
        return;
    }
    delete this;
}
// Take one more reference on this wrapper (atomic increment of fRefCount) and
// return the wrapper itself, so the call can be used directly in an assignment.
// Each addReference() is balanced by a removeReference().
RBBIDataWrapper *RBBIDataWrapper::addReference() {
umtx_atomic_inc(&fRefCount);
return this;
}
//-----------------------------------------------------------------------------
//
// getRuleSourceString
//
//-----------------------------------------------------------------------------
// Return the break rules source text. The string is the member fRuleString,
// which init() builds from the UTF-8 rule source stored in the data image;
// the reference stays valid only as long as this wrapper exists.
const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
return fRuleString;
}
//-----------------------------------------------------------------------------
//
// print - debugging function to dump the runtime data tables.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug-only: print one state transition table.
//   heading  label printed before the table.
//   table    the table to dump. May be nullptr (init() leaves a table pointer
//            null when the corresponding table length is zero); in that case
//            only the heading and a placeholder line are printed.
void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    uint32_t c;
    uint32_t s;
    RBBIDebugPrintf("%s\n", heading);

    // This check has to come before the first table-> access below; it used to
    // sit after the table fields had already been read.
    if (table == nullptr) {
        RBBIDebugPrintf(" N U L L T A B L E\n\n");
        return;
    }

    RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart);
    RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize);
    RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
                    table->fFlags,
                    table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
                    table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
                    table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");

    // Column headings: one column per character category.
    RBBIDebugPrintf("\nState | Acc LA TagIx");
    for (c=0; c<fHeader->fCatCount; c++) {
        RBBIDebugPrintf("%3d ", c);
    }
    RBBIDebugPrintf("\n------|---------------");
    for (c=0; c<fHeader->fCatCount; c++) {
        RBBIDebugPrintf("----");
    }
    RBBIDebugPrintf("\n");

    // One line per state. Rows are laid out with either 8 bit or 16 bit
    // entries, as selected by the RBBI_8BITS_ROWS flag.
    UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS;
    for (s=0; s<table->fNumStates; s++) {
        RBBIStateTableRow *row = (RBBIStateTableRow *)
                                 (table->fTableData + (table->fRowLen * s));
        if (use8Bits) {
            RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx);
            for (c=0; c<fHeader->fCatCount; c++) {
                RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
            }
        } else {
            RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx);
            for (c=0; c<fHeader->fCatCount; c++) {
                RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
            }
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n");
}
#endif
// Debug dump of the whole data image: header summary, both state tables and the
// original rule source. Compiles to an empty function unless RBBI_DEBUG is defined.
void RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
    // Header summary.
    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    RBBIDebugPrintf(" Version = {%d %d %d %d}\n",
                    fHeader->fFormatVersion[0],
                    fHeader->fFormatVersion[1],
                    fHeader->fFormatVersion[2],
                    fHeader->fFormatVersion[3]);
    RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
    RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);

    // State tables.
    printTable("Forward State Transition Table", fForwardTable);
    printTable("Reverse State Transition Table", fReverseTable);

    // Rule source, printed one char at a time up to the terminating NUL.
    RBBIDebugPrintf("\nOriginal Rules source:\n");
    for (const char *p = fRuleSource; *p != 0; ++p) {
        RBBIDebugPrintf("%c", *p);
    }
    RBBIDebugPrintf("\n\n");
#endif
}
U_NAMESPACE_END
U_NAMESPACE_USE
//-----------------------------------------------------------------------------
//
// ubrk_swap - byte swap and char encoding swap of RBBI data
//
//-----------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
UErrorCode *status) {
if (status == nullptr || U_FAILURE(*status)) {
return 0;
}
if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
*status=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
//
// Check that the data header is for for break data.
// (Header contents are defined in genbrk.cpp)
//
const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */
pInfo->dataFormat[1]==0x72 &&
pInfo->dataFormat[2]==0x6b &&
pInfo->dataFormat[3]==0x20 &&
RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) {
udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
pInfo->dataFormat[2], pInfo->dataFormat[3],
pInfo->formatVersion[0]);
*status=U_UNSUPPORTED_ERROR;
return 0;
}
//
// Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific
// RBBIDataHeader). This swap also conveniently gets us
// the size of the ICU d.h., which lets us locate the start
// of the RBBI specific data.
//
int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
//
// Get the RRBI Data Header, and check that it appears to be OK.
//
const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
!RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) ||
ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) {
udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
*status=U_UNSUPPORTED_ERROR;
return 0;
}
//
// Prefight operation? Just return the size
//
int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
int32_t totalSize = headerSize + breakDataLength;
if (length < 0) {
return totalSize;
}
//
// Check that length passed in is consistent with length from RBBI data header.
//
if (length < totalSize) {
udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
breakDataLength);
*status=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
//
// Swap the Data. Do the data itself first, then the RBBI Data Header, because
// we need to reference the header to locate the data, and an
// inplace swap of the header leaves it unusable.
//
uint8_t *outBytes = (uint8_t *)outData + headerSize;
RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes;
int32_t tableStartOffset;
int32_t tableLength;
//
// If not swapping in place, zero out the output buffer before starting.
// Individual tables and other data items within are aligned to 8 byte boundaries
// when originally created. Any unused space between items needs to be zero.
//
if (inBytes != outBytes) {
uprv_memset(outBytes, 0, breakDataLength);
}
//
// Each state table begins with several 32 bit fields. Calculate the size
// in bytes of these.
//
int32_t topSize = offsetof(RBBIStateTable, fTableData);
// Forward state table.
tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
tableLength = ds->readUInt32(rbbiDH->fFTableLen);
if (tableLength > 0) {
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
outBytes+tableStartOffset, status);
// Swap the state table if the table is in 16 bits.
if (use8Bits) {
if (outBytes != inBytes) {
uprv_memmove(outBytes+tableStartOffset+topSize,
inBytes+tableStartOffset+topSize,
tableLength-topSize);
}
} else {
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
outBytes+tableStartOffset+topSize, status);
}
}
// Reverse state table. Same layout as forward table, above.
tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
tableLength = ds->readUInt32(rbbiDH->fRTableLen);
if (tableLength > 0) {
RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
outBytes+tableStartOffset, status);
// Swap the state table if the table is in 16 bits.
if (use8Bits) {
if (outBytes != inBytes) {
uprv_memmove(outBytes+tableStartOffset+topSize,
inBytes+tableStartOffset+topSize,
tableLength-topSize);
}
} else {
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
outBytes+tableStartOffset+topSize, status);
}
}
// Trie table for character categories
ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
// Source Rules Text. It's UTF8 data
if (outBytes != inBytes) {
uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource),
inBytes+ds->readUInt32(rbbiDH->fRuleSource),
ds->readUInt32(rbbiDH->fRuleSourceLen));
}
// Table of rule status values. It's all int_32 values
ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
// And, last, the header.
// It is all int32_t values except for fFormataVersion, which is an array of four bytes.
// Swap the whole thing as int32_t, then re-swap the one field.
//
ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
return totalSize;
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

Some files were not shown because too many files have changed in this diff Show more