rpms/icu/F-9 icu.icu5691.backport.patch, NONE, 1.1 icu.icu5797.backport.patch, NONE, 1.1 icu.icu6001.backport.patch, NONE, 1.1 icu.icu6002.backport.patch, NONE, 1.1 icu.spec, 1.72, 1.73
Caolan McNamara
caolanm at fedoraproject.org
Thu Jun 11 19:29:15 UTC 2009
Author: caolanm
Update of /cvs/pkgs/rpms/icu/F-9
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv30990
Modified Files:
icu.spec
Added Files:
icu.icu5691.backport.patch icu.icu5797.backport.patch
icu.icu6001.backport.patch icu.icu6002.backport.patch
Log Message:
Resolves: rhbz#505368 CVE-2009-0153 Handle illegal sequences consistently
icu.icu5691.backport.patch:
--- NEW FILE icu.icu5691.backport.patch ---
diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c
--- icu.6175/source/common/ucnv2022.c 2009-06-11 13:44:44.000000000 +0100
+++ icu/source/common/ucnv2022.c 2009-06-11 18:09:48.000000000 +0100
@@ -1973,6 +1973,7 @@
mySourceChar = args->converter->toUBytes[0];
args->converter->toULength = 0;
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ targetUniChar = missingCharMarker;
goto getTrailByte;
}
@@ -2102,18 +2103,45 @@
default:
/* G0 DBCS */
if(mySource < mySourceLimit) {
+ int leadIsOk, trailIsOk;
char trailByte;
getTrailByte:
- trailByte = *mySource++;
- if(cs == JISX208) {
- _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
- } else {
- tempBuf[0] = (char)mySourceChar;
- tempBuf[1] = trailByte;
- }
- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
- } else {
+ trailByte = *mySource;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
+ * the 21..7e range, then we treat them as a pair.
+ * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+ * we report only the first byte as the illegal sequence.
+ */
+ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk == trailIsOk) {
+ ++mySource;
+ uint32_t tmpSourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+ if (leadIsOk) {
+ if(cs == JISX208) {
+ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
+ mySourceChar = tmpSourceChar;
+ } else {
+ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
+ mySourceChar = tmpSourceChar;
+ if (cs == KSC5601) {
+ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
+ }
+ tempBuf[0] = (char)(tmpSourceChar >> 8);
+ tempBuf[1] = (char)(tmpSourceChar);
+ }
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+ } else {
+ mySourceChar = tmpSourceChar;
+ }
+ }
+ } else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
args->converter->toULength = 1;
goto endloop;
@@ -2254,7 +2282,12 @@
}
/* only DBCS or SBCS characters are expected*/
/* DB characters with high bit set to 1 are expected */
- if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
+ if( length > 2 || length==0 ||
+ (length == 1 && targetByteUnit > 0x7f) ||
+ (length == 2 &&
+ ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
+ (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
+ ) {
targetByteUnit=missingCharMarker;
}
if (targetByteUnit != missingCharMarker){
@@ -2583,17 +2616,36 @@
myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
if(myData->toU2022State.g == 1) {
if(mySource < mySourceLimit) {
+ int leadIsOk, trailIsOk;
char trailByte;
getTrailByte:
- trailByte = *mySource++;
- tempBuf[0] = (char)(mySourceChar + 0x80);
- tempBuf[1] = (char)(trailByte + 0x80);
- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
- if((mySourceChar & 0x8080) == 0) {
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
+ targetUniChar = missingCharMarker;
+ trailByte = *mySource;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
+ * the 21..7e range, then we treat them as a pair.
+ * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+ * we report only the first byte as the illegal sequence.
+ */
+ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk == trailIsOk) {
+ ++mySource;
+ if (leadIsOk) {
+ tempBuf[0] = (char)(mySourceChar + 0x80);
+ tempBuf[1] = (char)(trailByte + 0x80);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
+ } else {
+ leadIsOk = TRUE; /* TODO: remove */
+ }
+ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
} else {
- /* illegal bytes > 0x7f */
- targetUniChar = missingCharMarker;
+ trailIsOk = TRUE; /* TODO: remove */
}
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
@@ -2601,8 +2653,10 @@
break;
}
}
- else{
+ else if(mySourceChar <= 0x7f) {
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
+ } else {
+ targetUniChar = 0xffff;
}
if(targetUniChar < 0xfffe){
if(args->offsets) {
@@ -3099,6 +3153,7 @@
/* continue with a partial double-byte character */
mySourceChar = args->converter->toUBytes[0];
args->converter->toULength = 0;
+ targetUniChar = missingCharMarker;
goto getTrailByte;
}
@@ -3178,29 +3233,48 @@
UConverterSharedData *cnv;
StateEnum tempState;
int32_t tempBufLen;
+ int leadIsOk, trailIsOk;
char trailByte;
getTrailByte:
- trailByte = *mySource++;
- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
- if(tempState > CNS_11643_0) {
- cnv = myData->myConverterArray[CNS_11643];
- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
- tempBuf[1] = (char) (mySourceChar);
- tempBuf[2] = trailByte;
- tempBufLen = 3;
-
- }else{
- cnv = myData->myConverterArray[tempState];
- tempBuf[0] = (char) (mySourceChar);
- tempBuf[1] = trailByte;
- tempBufLen = 2;
+ trailByte = *mySource;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
+ * the 21..7e range, then we treat them as a pair.
+ * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+ * we report only the first byte as the illegal sequence.
+ */
+ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk == trailIsOk) {
+ ++mySource;
+ if (leadIsOk) {
+ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ if(tempState >= CNS_11643_0) {
+ cnv = myData->myConverterArray[CNS_11643];
+ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
+ tempBuf[1] = (char) (mySourceChar);
+ tempBuf[2] = trailByte;
+ tempBufLen = 3;
+
+ }else{
+ cnv = myData->myConverterArray[tempState];
+ tempBuf[0] = (char) (mySourceChar);
+ tempBuf[1] = trailByte;
+ tempBufLen = 2;
+ }
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
+ }
+ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
}
- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
if(pToU2022State->g>=2) {
/* return from a single-shift state to the previous one */
pToU2022State->g=pToU2022State->prevG;
}
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
args->converter->toULength = 1;
diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c
--- icu.6175/source/common/ucnvhz.c 2009-06-11 13:44:44.000000000 +0100
+++ icu/source/common/ucnvhz.c 2009-06-11 18:05:36.000000000 +0100
@@ -215,19 +215,35 @@
}
else{
/* trail byte */
+ int leadIsOk, trailIsOk;
uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
- if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
- (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
- ) {
- tempBuf[0] = (char) (leadByte+0x80) ;
- tempBuf[1] = (char) (mySourceChar+0x80);
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
- tempBuf, 2, args->converter->useFallback);
+ targetUniChar = 0xffff;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In HZ DBCS, if both bytes are valid or both bytes are outside
+ * the 21..7d/7e range, then we treat them as a pair.
+ * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+ * we report only the first byte as the illegal sequence.
+ */
+ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
+ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk == trailIsOk) {
+ if (leadIsOk) {
+ tempBuf[0] = (char) (leadByte+0x80) ;
+ tempBuf[1] = (char) (mySourceChar+0x80);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+ tempBuf, 2, args->converter->useFallback);
+ }
+ /* add another bit so that the code below writes 2 bytes in case of error */
+ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
} else {
- targetUniChar = 0xffff;
+ --mySource;
+ mySourceChar = (int32_t)leadByte;
}
- /* add another bit so that the code below writes 2 bytes in case of error */
- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
args->converter->toUnicodeStatus =0x00;
}
}
diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.6175/source/common/ucnvmbcs.c 2009-06-11 13:44:44.000000000 +0100
+++ icu/source/common/ucnvmbcs.c 2009-06-11 18:05:36.000000000 +0100
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 2000-2007, International Business Machines
+* Copyright (C) 2000-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -1791,6 +1791,65 @@
pArgs->offsets=offsets;
}
+static UBool
+hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
+ const int32_t *row=stateTable[state];
+ int32_t b, entry;
+ /*
+ * Purpose: report whether the given state has any continuation that is not
+ * illegal. Returns TRUE as soon as a final (non-transition) entry with an
+ * action other than MBCS_STATE_ILLEGAL is found in this state's row, or in
+ * the row of any state reachable through transition entries; returns FALSE
+ * if no such entry exists.
+ *
+ * stateTable: table of rows with 256 entries each; state selects the row.
+ *
+ * First test for final entries in this state for some commonly valid byte values.
+ * (0xa1 and 0x41 are only a shortcut: the full 0..0xff scan below looks at
+ * the same two entries again, so the result does not depend on them.)
+ */
+ entry=row[0xa1];
+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+ ) {
+ return TRUE;
+ }
+ entry=row[0x41];
+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+ ) {
+ return TRUE;
+ }
+ /* Then test for final entries in this state. */
+ for(b=0; b<=0xff; ++b) {
+ entry=row[b];
+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+ ) {
+ return TRUE;
+ }
+ }
+ /*
+ * Then recurse for transition entries.
+ * All final entries of this row are checked (loop above) before any
+ * recursion happens, so the cheaper test always runs first.
+ * NOTE(review): there is no depth limit or visited-state tracking here;
+ * termination presumably relies on the state table having no transition
+ * cycles -- confirm against the table format.
+ */
+ for(b=0; b<=0xff; ++b) {
+ entry=row[b];
+ if( MBCS_ENTRY_IS_TRANSITION(entry) &&
+ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
+ ) {
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
+/*
+ * Is byte b a single/lead byte in this state?
+ * Recurse for transition states, because here we don't want to say that
+ * b is a lead byte if all byte sequences that start with b are illegal.
+ *
+ * stateTable: table of rows with 256 entries each; state selects the row
+ * and b selects the entry within that row.
+ * isDBCSOnly: when TRUE, a final entry whose action is
+ * MBCS_STATE_CHANGE_ONLY is rejected (returns FALSE).
+ *
+ * Returns:
+ * - for a transition entry: the result of hasValidTrailBytes() on the
+ * entry's target state;
+ * - for a final entry: FALSE in the DBCS-only state-change case above,
+ * otherwise TRUE unless the action is MBCS_STATE_ILLEGAL.
+ */
+static UBool
+isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
+ const int32_t *row=stateTable[state];
+ int32_t entry=row[b];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
+ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
+ } else {
+ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
+ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
+ return FALSE; /* SI/SO are illegal for DBCS-only conversion */
+ } else {
+ return action!=MBCS_STATE_ILLEGAL;
+ }
+ }
+}
+
U_CFUNC void
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
@@ -2146,6 +2205,34 @@
sourceIndex=nextSourceIndex;
} else if(U_FAILURE(*pErrorCode)) {
/* callback(illegal) */
+ if(byteIndex>1) {
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ */
+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
+ int8_t i;
+ for(i=1;
+ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
+ ++i) {}
+ if(i<byteIndex) {
+ /* Back out some bytes. */
+ int8_t backOutDistance=byteIndex-i;
+ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
+ byteIndex=i; /* length of reported illegal byte sequence */
+ if(backOutDistance<=bytesFromThisBuffer) {
+ source-=backOutDistance;
+ } else {
+ /* Back out bytes from the previous buffer: Need to replay them. */
+ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
+ /* preToULength is negative! */
+ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
+ source=(const uint8_t *)pArgs->source;
+ }
+ }
+ }
break;
} else /* unassigned sequences indicated with byteIndex>0 */ {
/* try an extension mapping */
@@ -2156,6 +2243,7 @@
&offsets, sourceIndex,
pArgs->flush,
pErrorCode);
+ /* TODO: nextSourceIndex+=diff instead of nextSourceIndex+diff ?? */
sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
if(U_FAILURE(*pErrorCode)) {
@@ -2447,15 +2535,37 @@
if(c<0) {
if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
- }
- if(U_FAILURE(*pErrorCode)) {
/* incomplete character byte sequence */
uint8_t *bytes=cnv->toUBytes;
cnv->toULength=(int8_t)(source-lastSource);
do {
*bytes++=*lastSource++;
} while(lastSource<source);
+ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+ } else if(U_FAILURE(*pErrorCode)) {
+ /* callback(illegal) */
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ */
+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
+ uint8_t *bytes=cnv->toUBytes;
+ *bytes++=*lastSource++; /* first byte */
+ if(lastSource==source) {
+ cnv->toULength=1;
+ } else /* lastSource<source: multi-byte character */ {
+ int8_t i;
+ for(i=1;
+ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
+ ++i
+ ) {
+ *bytes++=*lastSource++;
+ }
+ cnv->toULength=i;
+ source=lastSource;
+ }
} else {
/* no output because of empty input or only state changes */
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c
--- icu.6175/source/test/cintltst/nccbtst.c 2009-06-11 13:44:44.000000000 +0100
+++ icu/source/test/cintltst/nccbtst.c 2009-06-11 18:05:36.000000000 +0100
@@ -2497,13 +2497,13 @@
static const uint8_t text943[] = {
- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
+ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
+ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 };
+ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 };
static const UChar toUnicode943stop[]= { 0x304b};
- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};
- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};
+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };
+ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
static const int32_t fromIBM943Offsstop[] = { 0};
gInBufferSize = inputsize;
@@ -2537,9 +2537,9 @@
{
static const uint8_t sampleText[] = {
0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033};
- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};
+ 0xff, 0x32, 0x33};
+ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
/*checking illegal value for ibm-943 with substitute*/
gInBufferSize = inputsize;
gOutBufferSize = outputsize;
diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
--- icu.6175/source/test/cintltst/nucnvtst.c 2009-06-11 13:44:44.000000000 +0100
+++ icu/source/test/cintltst/nucnvtst.c 2009-06-11 18:05:36.000000000 +0100
@@ -2608,7 +2608,7 @@
TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
/*Test for the condition where there is an invalid character*/
{
- static const uint8_t source2[]={0xa1, 0x01};
+ static const uint8_t source2[]={0xa1, 0x80};
TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
}
/*Test for the condition where we have a truncated char*/
@@ -3901,11 +3901,11 @@
TestISO_2022_KR() {
/* test input */
static const uint16_t in[]={
- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
+ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
+ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
+ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
,0x53E3,0x53E4,0x000A,0x000D};
const UChar* uSource;
const UChar* uSourceLimit;
diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.6175/source/test/testdata/conversion.txt 2009-06-11 13:44:44.000000000 +0100
+++ icu/source/test/testdata/conversion.txt 2009-06-11 18:05:36.000000000 +0100
@@ -48,12 +48,83 @@
toUnicode {
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
Cases {
+ // Test ticket 5691: consistent illegal sequences
+ // Unfortunately, we cannot use the Shift-JIS examples from the ticket
+ // comments because our Shift-JIS table is Windows-compatible and
+ // therefore has no illegal single bytes. Same for GBK.
+ // Instead, we use the stricter GB 18030 also for 2-byte examples.
+ // The byte sequences are generally slightly different from the ticket
+ // comment, simply using assigned characters rather than just
+ // theoretically valid sequences.
+ {
+ "gb18030",
+ :bin{ 618140813c81ff7a },
+ "a\u4e02\\x81<\\x81\\xFFz",
+ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "EUC-JP",
+ :bin{ 618fb0a98fb03c8f3cb0a97a },
+ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
+ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "gb18030",
+ :bin{ 618130fc318130fc8181303c3e813cfc817a },
+ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
+ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "UTF-8",
+ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
+ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
+ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-JP-2",
+ :bin{ 1b24424141af4142affe41431b2842 },
+ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
+ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ibm-25546",
+ :bin{ 411b242943420e4141af4142affe41430f5a },
+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-KR",
+ :bin{ 411b242943420e4141af4142affe41430f5a },
+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-CN",
+ :bin{ 411b242941420e4141af4142affe41430f5a },
+ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "HZ",
+ :bin{ 417e7b4141af4142affe41437e7d5a },
+ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
+ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
// test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
{
"HZ",
:bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
- "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
- :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
+ "\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
+ :intvector{ 2,4,5,6,8,9,10,12,14,18,19,21,24 },
:int{1}, :int{1}, "", "?", :bin{""}
}
// improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
@@ -61,8 +132,8 @@
{
"ISO-2022-JP",
:bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
+ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\ufffd\ufffd\u25b2\ufffd\ufffd\u6f3e",
+ :intvector{ 3,4,5,9,11,12,13,14,16,17,19,20,21,22,23,25,26,27 },
:int{1}, :int{1}, "", "?", :bin{""}
}
// improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
icu.icu5797.backport.patch:
--- NEW FILE icu.icu5797.backport.patch ---
diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c
--- icu.orig/source/common/ucnv2022.c 2009-06-11 12:21:48.000000000 +0100
+++ icu/source/common/ucnv2022.c 2009-06-11 12:24:48.000000000 +0100
@@ -472,8 +472,7 @@
if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
}
- myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
- myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
+ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
if(jpCharsetMasks[version]&CSM(JISX212)) {
myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
}
@@ -1040,14 +1039,6 @@
length=3;
}
}
- /*
- * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
- * Pass in parameter for type of output bytes, for validation and shifting:
- * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
- * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
- * - A1-FE: Subtract 80 after range check.
- * - SJIS: Shift DBCS result to 21-7E x 21-7E.
- */
/* is this code point assigned, or do we use fallbacks? */
if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
/* assigned */
@@ -1105,6 +1096,23 @@
}
}
+/*
+ * Check that the result is a 2-byte value with each byte in the range A1..FE
+ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
+ * to move it to the ISO 2022 range 21..7E.
+ * Return 0 if out of range.
+ *
+ * Each range test is a single unsigned comparison: after subtracting the
+ * lower bound and truncating to the unsigned type, any input below the lower
+ * bound wraps around to a large number and fails the "<=" test as well.
+ * The uint16_t test bounds the low 16 bits to A1A1..FEFE; the uint8_t test
+ * separately bounds the low (trail) byte to A1..FE.
+ * 0 doubles as the "invalid" marker; a successful result always has both
+ * bytes in 21..7E and is therefore never 0.
+ */
+static U_INLINE uint32_t
+_2022FromGR94DBCS(uint32_t value) {
+ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && /* low 16 bits in A1A1..FEFE */
+ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) /* trail byte in A1..FE */
+ ) {
+ return value - 0x8080; /* shift down to 21..7e byte range */
+ } else {
+ return 0; /* not valid for ISO 2022 */
+ }
+}
+
#ifdef U_ENABLE_GENERIC_ISO_2022
/**********************************************************************************
@@ -1233,7 +1241,7 @@
}
else{
cnv->toUBytes[0] =(char) sourceChar;
- cnv->toULength = 2;
+ cnv->toULength = 1;
}
if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
@@ -1344,6 +1352,181 @@
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
+/*
+ * Map 00..7F to Unicode according to JIS X 0201.
+ *
+ * Only two byte values differ from the identity mapping:
+ * 0x5c maps to U+00A5 and 0x7e maps to U+203E.
+ * Every other input is returned unchanged.
+ * There is no range check in this function: an input above 0x7f would also
+ * be returned unchanged by the last branch, so the caller is expected to
+ * pass only 00..7F (see the "value <= 0x7f" note on that branch).
+ */
+static U_INLINE uint32_t
+jisx201ToU(uint32_t value) {
+ if(value < 0x5c) {
+ return value;
+ } else if(value == 0x5c) {
+ return 0xa5;
+ } else if(value == 0x7e) {
+ return 0x203e;
+ } else /* value <= 0x7f */ {
+ return value;
+ }
+}
+
+/*
+ * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
+ *
+ * Mapping performed by the code below:
+ * - U+0000..U+007F except U+005C and U+007E: returned unchanged;
+ * - U+00A5: returns 0x5c;
+ * - U+203E: returns 0x7e;
+ * - everything else, including U+005C and U+007E themselves: returns 0xfffe.
+ * U+005C and U+007E fall through both inner tests to the final return,
+ * because their byte values are the ones produced for U+00A5 and U+203E.
+ * A caller can therefore detect success with a "result <= 0x7f" test.
+ */
+static U_INLINE uint32_t
+jisx201FromU(uint32_t value) {
+ if(value<=0x7f) {
+ if(value!=0x5c && value!=0x7e) {
+ return value;
+ }
+ } else if(value==0xa5) {
+ return 0x5c;
+ } else if(value==0x203e) {
+ return 0x7e;
+ }
+ return 0xfffe;
+}
+
+/*
+ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
+ * to JIS X 0208, and convert it to a pair of 21..7E bytes.
+ * Return 0 if the byte pair is out of range.
+ *
+ * value: the byte pair as one integer, lead byte in bits 15..8 and trail
+ * byte in bits 7..0.
+ * The only range check made here is the upper bound 0xEFFC; the input is
+ * otherwise trusted to be a valid Shift-JIS pair, as stated above.
+ *
+ * How the arithmetic works: each Shift-JIS lead byte covers two consecutive
+ * 21..7E lead values. The lead byte is rebased and doubled, which yields the
+ * second (even) value of that pair; trail bytes up to 0x9e belong to the
+ * first (odd) value, so 0x100 is subtracted for them.
+ * Example: 0x8140 -> 0x2121.
+ */
+static U_INLINE uint32_t
+_2022FromSJIS(uint32_t value) {
+ uint8_t trail;
+
+ if(value > 0xEFFC) {
+ return 0; /* beyond JIS X 0208 */
+ }
+
+ trail = (uint8_t)value;
+
+ value &= 0xff00; /* lead byte */
+ if(value <= 0x9f00) {
+ value -= 0x7000;
+ } else /* 0xe000 <= value <= 0xef00 */ {
+ value -= 0xb000;
+ }
+ value <<= 1; /* doubling gives the even 21..7E lead value in bits 15..8 */
+
+ if(trail <= 0x9e) {
+ value -= 0x100; /* step back to the odd lead value */
+ if(trail <= 0x7e) {
+ value |= trail - 0x1f; /* e.g. trail 0x40 -> 0x21, 0x7e -> 0x5f */
+ } else {
+ value |= trail - 0x20; /* e.g. trail 0x80 -> 0x60, 0x9e -> 0x7e */
+ }
+ } else /* trail <= 0xfc */ {
+ value |= trail - 0x7e; /* e.g. trail 0x9f -> 0x21, 0xfc -> 0x7e */
+ }
+ return value;
+}
+
+/*
+ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
+ * If either byte is outside 21..7E make sure that the result is not valid
+ * for Shift-JIS so that the converter catches it.
+ * Some invalid byte values already turn into equally invalid Shift-JIS
+ * byte values and need not be tested explicitly.
+ *
+ * c1: first (lead) byte of the pair.
+ * c2: second (trail) byte of the pair.
+ * bytes: output; bytes[0] receives the Shift-JIS lead byte and bytes[1] the
+ * Shift-JIS trail byte. A byte that is detected as invalid is written as 0.
+ *
+ * Two consecutive c1 values (odd, then even) share one Shift-JIS lead byte:
+ * an odd c1 is rounded up before halving and selects the lower trail range,
+ * an even c1 selects the upper trail range (c2 + 0x7e).
+ * Example: 0x21 0x21 -> 0x81 0x40.
+ */
+static U_INLINE void
+_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
+ if(c1&1) {
+ ++c1; /* round up so that the halving below maps both c1 values of a pair alike */
+ if(c2 <= 0x5f) {
+ c2 += 0x1f; /* no explicit test for c2 below 0x21 here; see the header comment */
+ } else if(c2 <= 0x7e) {
+ c2 += 0x20; /* one more than above: the result skips over 0x7f */
+ } else {
+ c2 = 0; /* invalid */
+ }
+ } else {
+ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
+ c2 += 0x7e;
+ } else {
+ c2 = 0; /* invalid */
+ }
+ }
+ c1 >>= 1;
+ if(c1 <= 0x2f) {
+ c1 += 0x70;
+ } else if(c1 <= 0x3f) {
+ c1 += 0xb0;
+ } else {
+ c1 = 0; /* invalid */
+ }
+ bytes[0] = (char)c1;
+ bytes[1] = (char)c2;
+}
+
+/*
+ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
+ * Katakana.
+ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
+ * because Shift-JIS roundtrips half-width Katakana to single bytes.
+ * These were the only fallbacks in ICU's jisx-208.ucm file.
+ *
+ * The table has one entry per code point from HWKANA_START to HWKANA_END
+ * and is indexed by (code point - HWKANA_START).
+ * Each entry is a 16-bit double-byte value with both bytes in 21..7E.
+ * The U+FFxx markers below name the code point of the entry they sit on.
+ * NOTE(review): the first/last markers suggest HWKANA_START..HWKANA_END is
+ * U+FF61..U+FF9F; the macro definitions are outside this hunk -- confirm.
+ */
+static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
+ 0x2123, /* U+FF61 */
+ 0x2156,
+ 0x2157,
+ 0x2122,
+ 0x2126,
+ 0x2572,
+ 0x2521,
+ 0x2523,
+ 0x2525,
+ 0x2527,
+ 0x2529,
+ 0x2563,
+ 0x2565,
+ 0x2567,
+ 0x2543,
+ 0x213C, /* U+FF70 */
+ 0x2522,
+ 0x2524,
+ 0x2526,
+ 0x2528,
+ 0x252A,
+ 0x252B,
+ 0x252D,
+ 0x252F,
+ 0x2531,
+ 0x2533,
+ 0x2535,
+ 0x2537,
+ 0x2539,
+ 0x253B,
+ 0x253D,
+ 0x253F, /* U+FF80 */
+ 0x2541,
+ 0x2544,
+ 0x2546,
+ 0x2548,
+ 0x254A,
+ 0x254B,
+ 0x254C,
+ 0x254D,
+ 0x254E,
+ 0x254F,
+ 0x2552,
+ 0x2555,
+ 0x2558,
+ 0x255B,
+ 0x255E,
+ 0x255F, /* U+FF90 */
+ 0x2560,
+ 0x2561,
+ 0x2562,
+ 0x2564,
+ 0x2566,
+ 0x2568,
+ 0x2569,
+ 0x256A,
+ 0x256B,
+ 0x256C,
+ 0x256D,
+ 0x256F,
+ 0x2573,
+ 0x212B,
+ 0x212C /* U+FF9F */
+};
+
static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
UConverter *cnv = args->converter;
@@ -1499,7 +1682,7 @@
}
break;
case HWKANA_7BIT:
- if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
+ if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
if(converterData->version==3) {
/* JIS7: use G1 (SO) */
/* Shift U+FF61..U+FF9F to bytes 21..5F. */
@@ -1526,13 +1709,34 @@
break;
case JISX201:
/* G0 SBCS */
- len2 = MBCS_SINGLE_FROM_UCHAR32(
+ value = jisx201FromU(sourceChar);
+ if(value <= 0x7f) {
+ targetValue = value;
+ len = 1;
+ cs = cs0;
+ g = 0;
+ useFallback = FALSE;
+ }
+ break;
+ case JISX208:
+ /* G0 DBCS from Shift-JIS table */
+ len2 = MBCS_FROM_UCHAR32_ISO2022(
converterData->myConverterArray[cs0],
sourceChar, &value,
- useFallback);
- if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
- targetValue = value;
- len = len2;
+ useFallback, MBCS_OUTPUT_2);
+ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
+ value = _2022FromSJIS(value);
+ if(value != 0) {
+ targetValue = value;
+ len = len2;
+ cs = cs0;
+ g = 0;
+ useFallback = FALSE;
+ }
+ } else if(len == 0 && useFallback &&
+ (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
+ targetValue = hwkana_fb[sourceChar - HWKANA_START];
+ len = -2;
cs = cs0;
g = 0;
useFallback = FALSE;
@@ -1564,17 +1768,10 @@
* Check for valid bytes for the encoding scheme.
* This is necessary because the sub-converter (windows-949)
* has a broader encoding scheme than is valid for 2022.
- *
- * Check that the result is a 2-byte value with each byte in the range A1..FE
- * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
- * to move it to the ISO 2022 range 21..7E.
*/
- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
- ) {
- value -= 0x8080; /* shift down to 21..7e byte range */
- } else {
- break; /* not valid for ISO 2022 */
+ value = _2022FromGR94DBCS(value);
+ if(value == 0) {
+ break;
}
}
targetValue = value;
@@ -1750,7 +1947,7 @@
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
UErrorCode* err){
- char tempBuf[3];
+ char tempBuf[2];
const char *mySource = (char *) args->source;
UChar *myTarget = args->target;
const char *mySourceLimit = args->sourceLimit;
@@ -1868,10 +2065,7 @@
break;
case JISX201:
if(mySourceChar <= 0x7f) {
- targetUniChar =
- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
- myData->myConverterArray[cs],
- mySourceChar);
+ targetUniChar = jisx201ToU(mySourceChar);
}
break;
case HWKANA_7BIT:
@@ -1885,8 +2079,13 @@
if(mySource < mySourceLimit) {
char trailByte;
getTrailByte:
- tempBuf[0] = (char) (mySourceChar);
- tempBuf[1] = trailByte = *mySource++;
+ trailByte = *mySource++;
+ if(cs == JISX208) {
+ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
+ } else {
+ tempBuf[0] = (char)mySourceChar;
+ tempBuf[1] = trailByte;
+ }
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
} else {
@@ -3190,6 +3389,9 @@
/* open a set and initialize it with code points that are algorithmically round-tripped */
switch(cnvData->locale[0]){
case 'j':
+ /* include JIS X 0201 which is hardcoded */
+ sa->add(sa->set, 0xa5);
+ sa->add(sa->set, 0x203e);
if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
/* include Latin-1 for some variants of JP */
sa->addRange(sa->set, 0, 0xff);
@@ -3198,6 +3400,11 @@
sa->addRange(sa->set, 0, 0x7f);
}
if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
+ /*
+ * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
+ * we need to include half-width Katakana for all JP variants because
+ * JIS X 0208 has hardcoded fallbacks for them.
+ */
/* include half-width Katakana for JP */
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
}
@@ -3217,15 +3424,7 @@
break;
}
- /*
- * Version-specific for CN:
- * CN version 0 does not map CNS planes 3..7 although
- * they are all available in the CNS conversion table;
- * CN version 1 does map them all.
- * The two versions create different Unicode sets.
- */
- for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
- if(cnvData->myConverterArray[i]!=NULL) {
+#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
cnvData->version==0 && i==CNS_11643
) {
@@ -3235,9 +3434,33 @@
sa, UCNV_ROUNDTRIP_SET,
0, 0x81, 0x82,
pErrorCode);
+ }
+#endif
+
+ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
+ UConverterSetFilter filter;
+ if(cnvData->myConverterArray[i]!=NULL) {
+ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
+ cnvData->version==0 && i==CNS_11643
+ ) {
+ /*
+ * Version-specific for CN:
+ * CN version 0 does not map CNS planes 3..7 although
+ * they are all available in the CNS conversion table;
+ * CN version 1 (-EXT) does map them all.
+ * The two versions create different Unicode sets.
+ */
+ filter=UCNV_SET_FILTER_2022_CN;
+ } else if(cnvData->locale[0]=='j' && i==JISX208) {
+ /*
+ * Only add code points that map to Shift-JIS codes
+ * corresponding to JIS X 0208.
+ */
+ filter=UCNV_SET_FILTER_SJIS;
} else {
- ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
+ filter=UCNV_SET_FILTER_NONE;
}
+ ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
}
}
diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.orig/source/common/ucnvmbcs.c 2009-06-11 12:21:48.000000000 +0100
+++ icu/source/common/ucnvmbcs.c 2009-06-11 12:22:56.000000000 +0100
@@ -362,6 +362,8 @@
/* Miscellaneous ------------------------------------------------------------ */
+#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
+
/* similar to ucnv_MBCSGetNextUChar() but recursive */
static void
_getUnicodeSetForBytes(const UConverterSharedData *sharedData,
@@ -454,11 +456,14 @@
pErrorCode);
}
+#endif
+
U_CFUNC void
-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
- const USetAdder *sa,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode) {
+ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
+ UConverterSetFilter filter,
+ UErrorCode *pErrorCode) {
const UConverterMBCSTable *mbcsTable;
const uint16_t *table;
@@ -512,50 +517,26 @@
c+=1024; /* empty stage 2 block */
}
}
- } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) {
- /* ignore single-byte results */
+ } else {
const uint32_t *stage2;
- const uint16_t *stage3, *results;
+ const uint8_t *stage3, *bytes;
+ uint32_t st3Multiplier;
+ uint32_t value;
- results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
-
- for(st1=0; st1<maxStage1; ++st1) {
- st2=table[st1];
- if(st2>(maxStage1>>1)) {
- stage2=(const uint32_t *)table+st2;
- for(st2=0; st2<64; ++st2) {
- if((st3=stage2[st2])!=0) {
- /* read the stage 3 block */
- stage3=results+16*(uint32_t)(uint16_t)st3;
-
- /* get the roundtrip flags for the stage 3 block */
- st3>>=16;
+ bytes=mbcsTable->fromUnicodeBytes;
- /*
- * Add code points for which the roundtrip flag is set.
- * Once we get a set for fallback mappings, we have to check
- * non-roundtrip stage 3 results for whether they are 0.
- * See ucnv_MBCSFromUnicodeWithOffsets() for details.
- *
- * Ignore single-byte results (<0x100).
- */
- do {
- if((st3&1)!=0 && *stage3>=0x100) {
- sa->add(sa->set, c);
- }
- st3>>=1;
- ++stage3;
- } while((++c&0xf)!=0);
- } else {
- c+=16; /* empty stage 3 block */
- }
- }
- } else {
- c+=1024; /* empty stage 2 block */
- }
+ switch(mbcsTable->outputType) {
+ case MBCS_OUTPUT_3:
+ case MBCS_OUTPUT_4_EUC:
+ st3Multiplier=3;
+ break;
+ case MBCS_OUTPUT_4:
+ st3Multiplier=4;
+ break;
+ default:
+ st3Multiplier=2;
+ break;
}
- } else {
- const uint32_t *stage2;
for(st1=0; st1<maxStage1; ++st1) {
st2=table[st1];
@@ -563,6 +544,9 @@
stage2=(const uint32_t *)table+st2;
for(st2=0; st2<64; ++st2) {
if((st3=stage2[st2])!=0) {
+ /* read the stage 3 block */
+ stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
+
/* get the roundtrip flags for the stage 3 block */
st3>>=16;
@@ -572,12 +556,49 @@
* non-roundtrip stage 3 results for whether they are 0.
* See ucnv_MBCSFromUnicodeWithOffsets() for details.
*/
- do {
- if(st3&1) {
- sa->add(sa->set, c);
- }
- st3>>=1;
- } while((++c&0xf)!=0);
+ switch(filter) {
+ case UCNV_SET_FILTER_NONE:
+ do {
+ if(st3&1) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+ } while((++c&0xf)!=0);
+ break;
+ case UCNV_SET_FILTER_DBCS_ONLY:
+ /* Ignore single-byte results (<0x100). */
+ do {
+ if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+ stage3+=2; /* +=st3Multiplier */
+ } while((++c&0xf)!=0);
+ break;
+ case UCNV_SET_FILTER_2022_CN:
+ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
+ do {
+ if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+ stage3+=3; /* +=st3Multiplier */
+ } while((++c&0xf)!=0);
+ break;
+ case UCNV_SET_FILTER_SJIS:
+ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
+ do {
+ if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+ stage3+=2; /* +=st3Multiplier */
+ } while((++c&0xf)!=0);
+ break;
+ default:
+ *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
+ return;
+ }
} else {
c+=16; /* empty stage 3 block */
}
@@ -591,6 +612,19 @@
ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
}
+U_CFUNC void
+ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode) {
+ ucnv_MBCSGetFilteredUnicodeSetForUnicode(
+ sharedData, sa, which,
+ sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
+ UCNV_SET_FILTER_DBCS_ONLY :
+ UCNV_SET_FILTER_NONE,
+ pErrorCode);
+}
+
static void
ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
const USetAdder *sa,
diff -ru icu.orig/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
--- icu.orig/source/common/ucnvmbcs.h 2009-06-11 12:21:48.000000000 +0100
+++ icu/source/common/ucnvmbcs.h 2009-06-11 12:22:56.000000000 +0100
@@ -456,6 +456,7 @@
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
+#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
/*
* Internal function returning a UnicodeSet for toUnicode() conversion.
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
@@ -470,6 +471,7 @@
UConverterUnicodeSet which,
uint8_t state, int32_t lowByte, int32_t highByte,
UErrorCode *pErrorCode);
+#endif
/*
* Internal function returning a UnicodeSet for toUnicode() conversion.
@@ -481,9 +483,30 @@
*/
U_CFUNC void
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
- const USetAdder *sa,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode);
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode);
+
+typedef enum UConverterSetFilter {
+ UCNV_SET_FILTER_NONE,
+ UCNV_SET_FILTER_DBCS_ONLY,
+ UCNV_SET_FILTER_2022_CN,
+ UCNV_SET_FILTER_SJIS,
+ UCNV_SET_FILTER_COUNT
+} UConverterSetFilter;
+
+/*
+ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but
+ * the set can be filtered by encoding scheme.
+ * Used by stateful converters which share regular conversion tables
+ * but only use a subset of their mappings.
+ */
+U_CFUNC void
+ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
+ UConverterSetFilter filter,
+ UErrorCode *pErrorCode);
#endif
diff -ru icu.orig/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
--- icu.orig/source/test/cintltst/nucnvtst.c 2009-06-11 12:21:47.000000000 +0100
+++ icu/source/test/cintltst/nucnvtst.c 2009-06-11 12:23:19.000000000 +0100
@@ -3202,7 +3202,7 @@
0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A,
0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
0x201D, 0x3014, 0x000D, 0x000A,
0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
@@ -3730,7 +3730,7 @@
0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A,
0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
0x201D, 0x000D, 0x000A,
0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A,
diff -ru icu.orig/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c
--- icu.orig/source/test/cintltst/udatatst.c 2009-06-11 12:21:47.000000000 +0100
+++ icu/source/test/cintltst/udatatst.c 2009-06-11 12:23:19.000000000 +0100
@@ -1281,7 +1281,7 @@
* MBCS conversion table file without extension,
* to test swapping and preflighting of UTF-8-friendly mbcsIndex[].
*/
- {"jisx-208", "cnv", ucnv_swap},
+ {"jisx-212", "cnv", ucnv_swap},
#endif
#if !UCONFIG_NO_CONVERSION
diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.orig/source/test/testdata/conversion.txt 2009-06-11 12:21:48.000000000 +0100
+++ icu/source/test/testdata/conversion.txt 2009-06-11 12:22:56.000000000 +0100
@@ -48,6 +48,15 @@
toUnicode {
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
Cases {
+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+ {
+ "ISO-2022-JP",
+ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
+ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
+ :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
// improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
{
"ISO-8859-3",
@@ -495,6 +504,15 @@
fromUnicode {
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
Cases {
+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+ {
+ "ISO-2022-JP",
+ "\u203e\xa5\u4e00\ufa10\u6f3e\u0391",
+ :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 },
+ :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 },
+ :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e
+ }
// Verify that mappings that would result in byte values outside 20..7F (for SBCS)
// or 21..7E (for DBCS) are not used.
// ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46):
@@ -1293,13 +1311,13 @@
// versions of ISO-2022-JP
{
"ISO-2022-JP",
- "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
+ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
+ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
:int{0}
}
{
"ISO-2022-JP-2",
- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
+ "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
"[\x0e\x0f\x1b\uffe7-\U0010ffff]",
:int{0}
}
icu.icu6001.backport.patch:
--- NEW FILE icu.icu6001.backport.patch ---
diff -ru icu.icu5797/source/common/ucnv2022.c icu/source/common/ucnv2022.c
--- icu.icu5797/source/common/ucnv2022.c 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv2022.c 2009-06-11 13:21:14.000000000 +0100
@@ -3399,11 +3399,19 @@
/* include ASCII for JP */
sa->addRange(sa->set, 0, 0x7f);
}
- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
+ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
/*
- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
- * we need to include half-width Katakana for all JP variants because
- * JIS X 0208 has hardcoded fallbacks for them.
+ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
+ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
+ * use half-width Katakana.
+ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
+ * half-width Katakana via the ESC ( I sequence.
+ * However, we only emit (fromUnicode) half-width Katakana according to the
+ * definition of each variant.
+ *
+ * When including fallbacks,
+ * we need to include half-width Katakana Unicode code points for all JP variants because
+ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
*/
/* include half-width Katakana for JP */
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
@@ -3457,6 +3465,12 @@
* corresponding to JIS X 0208.
*/
filter=UCNV_SET_FILTER_SJIS;
+ } else if(i==KSC5601) {
+ /*
+ * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
+ * are broader than GR94.
+ */
+ filter=UCNV_SET_FILTER_GR94DBCS;
} else {
filter=UCNV_SET_FILTER_NONE;
}
@@ -3472,6 +3486,9 @@
sa->remove(sa->set, 0x0e);
sa->remove(sa->set, 0x0f);
sa->remove(sa->set, 0x1b);
+
+ /* ISO 2022 converters do not convert C1 controls either */
+ sa->removeRange(sa->set, 0x80, 0x9f);
}
static const UConverterImpl _ISO2022Impl={
diff -ru icu.icu5797/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
--- icu.icu5797/source/common/ucnv_ext.c 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_ext.c 2009-06-11 13:21:14.000000000 +0100
@@ -946,7 +946,7 @@
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
const int32_t *cx,
const USetAdder *sa,
- UConverterUnicodeSet which,
+ UBool useFallback,
int32_t minLength,
UChar32 c,
UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
@@ -966,7 +966,7 @@
value=*fromUSectionValues++;
if( value!=0 &&
- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
+ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
if(c>=0) {
@@ -987,12 +987,14 @@
/* no mapping, do nothing */
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
ucnv_extGetUnicodeSetString(
- sharedData, cx, sa, which, minLength,
+ sharedData, cx, sa, useFallback, minLength,
U_SENTINEL, s, length+1,
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
pErrorCode);
- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+ } else if((useFallback ?
+ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
+ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
sa->addString(sa->set, s, length+1);
@@ -1004,6 +1006,7 @@
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
const USetAdder *sa,
UConverterUnicodeSet which,
+ UConverterSetFilter filter,
UErrorCode *pErrorCode) {
const int32_t *cx;
const uint16_t *stage12, *stage3, *ps2, *ps3;
@@ -1011,6 +1014,7 @@
uint32_t value;
int32_t st1, stage1Length, st2, st3, minLength;
+ UBool useFallback;
UChar s[UCNV_EXT_MAX_UCHARS];
UChar32 c;
@@ -1027,12 +1031,20 @@
stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
+ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
+
/* enumerate the from-Unicode trie table */
c=0; /* keep track of the current code point while enumerating */
- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
+ if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
+ filter==UCNV_SET_FILTER_DBCS_ONLY ||
+ filter==UCNV_SET_FILTER_SJIS ||
+ filter==UCNV_SET_FILTER_GR94DBCS
+ ) {
/* DBCS-only, ignore single-byte results */
minLength=2;
+ } else if(filter==UCNV_SET_FILTER_2022_CN) {
+ minLength=3;
} else {
minLength=1;
}
@@ -1064,14 +1076,41 @@
length=0;
U16_APPEND_UNSAFE(s, length, c);
ucnv_extGetUnicodeSetString(
- sharedData, cx, sa, which, minLength,
+ sharedData, cx, sa, useFallback, minLength,
c, s, length,
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
pErrorCode);
- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+ } else if((useFallback ?
+ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
+ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
+ switch(filter) {
+ case UCNV_SET_FILTER_2022_CN:
+ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
+ continue;
+ }
+ break;
+ case UCNV_SET_FILTER_SJIS:
+ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
+ continue;
+ }
+ break;
+ case UCNV_SET_FILTER_GR94DBCS:
+ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
+ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) &&
+ (uint8_t)(value-0xa1)<=(0xfe-0xa1))) {
+ continue;
+ }
+ break;
+ default:
+ /*
+ * UCNV_SET_FILTER_NONE,
+ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
+ */
+ break;
+ }
sa->add(sa->set, c);
}
} while((++c&0xf)!=0);
diff -ru icu.icu5797/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h
--- icu.icu5797/source/common/ucnv_ext.h 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_ext.h 2009-06-11 13:21:14.000000000 +0100
@@ -382,10 +382,20 @@
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
UErrorCode *pErrorCode);
+/*
+ * Add code points and strings to the set according to the extension mappings.
+ * Limitation on the UConverterSetFilter:
+ * The filters currently assume that they are used with 1:1 mappings.
+ * They only apply to single input code points, and then they pass through
+ * only mappings with single-charset-code results.
+ * For example, the Shift-JIS filter only works for 2-byte results and tests
+ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
+ */
U_CFUNC void
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
const USetAdder *sa,
UConverterUnicodeSet which,
+ UConverterSetFilter filter,
UErrorCode *pErrorCode);
/* toUnicode helpers -------------------------------------------------------- */
diff -ru icu.icu5797/source/common/ucnvhz.c icu/source/common/ucnvhz.c
--- icu.icu5797/source/common/ucnvhz.c 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnvhz.c 2009-06-11 13:21:14.000000000 +0100
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2000-2006, International Business Machines
+* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvhz.c
@@ -528,6 +528,7 @@
sa->add(sa->set, 0x7e);
/* add all of the code points that the sub-converter handles */
+ /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
((UConverterDataHZ*)cnv->extraInfo)->
gbConverter->sharedData->impl->
getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
diff -ru icu.icu5797/source/common/ucnv_lmb.c icu/source/common/ucnv_lmb.c
--- icu.icu5797/source/common/ucnv_lmb.c 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_lmb.c 2009-06-11 13:21:33.000000000 +0100
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2000-2006, International Business Machines
+* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_lmb.cpp
@@ -536,7 +536,7 @@
NULL,\
NULL,\
_LMBCSSafeClone,\
- _LMBCSGetUnicodeSet\
+ ucnv_getCompleteUnicodeSet\
};\
static const UConverterStaticData _LMBCSStaticData##n={\
sizeof(UConverterStaticData),\
@@ -662,15 +662,14 @@
return &newLMBCS->cnv;
}
-static void
-_LMBCSGetUnicodeSet(const UConverter *cnv,
- const USetAdder *sa,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode) {
- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
- sa->addRange(sa->set, 0, 0xf5ff);
- sa->addRange(sa->set, 0xf700, 0x10ffff);
-}
+/*
+ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
+ * which added all code points except for U+F6xx
+ * because those cannot be represented in the Unicode group.
+ * However, it turns out that windows-950 has roundtrips for all of U+F6xx
+ * which means that LMBCS can convert all Unicode code points after all.
+ * We now simply use ucnv_getCompleteUnicodeSet().
+ */
/*
Here's the basic helper function that we use when converting from
diff -ru icu.icu5797/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.icu5797/source/common/ucnvmbcs.c 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnvmbcs.c 2009-06-11 13:21:14.000000000 +0100
@@ -485,9 +485,23 @@
if(mbcsTable->outputType==MBCS_OUTPUT_1) {
const uint16_t *stage2, *stage3, *results;
+ uint16_t minValue;
results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
+ /*
+ * Set a threshold variable for selecting which mappings to use.
+ * See ucnv_MBCSSingleFromBMPWithOffsets() and
+ * MBCS_SINGLE_RESULT_FROM_U() for details.
+ */
+ if(which==UCNV_ROUNDTRIP_SET) {
+ /* use only roundtrips */
+ minValue=0xf00;
+ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
+ /* use all roundtrip and fallback results */
+ minValue=0x800;
+ }
+
for(st1=0; st1<maxStage1; ++st1) {
st2=table[st1];
if(st2>maxStage1) {
@@ -497,15 +511,8 @@
/* read the stage 3 block */
stage3=results+st3;
- /*
- * Add code points for which the roundtrip flag is set.
- * Once we get a set for fallback mappings, we have to use
- * a threshold variable with a value of 0x800.
- * See ucnv_MBCSSingleFromBMPWithOffsets() and
- * MBCS_SINGLE_RESULT_FROM_U() for details.
- */
do {
- if(*stage3++>=0xf00) {
+ if(*stage3++>=minValue) {
sa->add(sa->set, c);
}
} while((++c&0xf)!=0);
@@ -522,9 +529,12 @@
const uint8_t *stage3, *bytes;
uint32_t st3Multiplier;
uint32_t value;
+ UBool useFallback;
bytes=mbcsTable->fromUnicodeBytes;
+ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
+
switch(mbcsTable->outputType) {
case MBCS_OUTPUT_3:
case MBCS_OUTPUT_4_EUC:
@@ -551,9 +561,8 @@
st3>>=16;
/*
- * Add code points for which the roundtrip flag is set.
- * Once we get a set for fallback mappings, we have to check
- * non-roundtrip stage 3 results for whether they are 0.
+ * Add code points for which the roundtrip flag is set,
+ * or which map to non-zero bytes if we use fallbacks.
* See ucnv_MBCSFromUnicodeWithOffsets() for details.
*/
switch(filter) {
@@ -561,6 +570,23 @@
do {
if(st3&1) {
sa->add(sa->set, c);
+ stage3+=st3Multiplier;
+ } else if(useFallback) {
+ uint8_t b=0;
+ switch(st3Multiplier) {
+ case 4:
+ b|=*stage3++;
+ case 3:
+ b|=*stage3++;
+ case 2:
+ b|=stage3[0]|stage3[1];
+ stage3+=2;
+ default:
+ break;
+ }
+ if(b!=0) {
+ sa->add(sa->set, c);
+ }
}
st3>>=1;
} while((++c&0xf)!=0);
@@ -568,7 +594,7 @@
case UCNV_SET_FILTER_DBCS_ONLY:
/* Ignore single-byte results (<0x100). */
do {
- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
+ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
sa->add(sa->set, c);
}
st3>>=1;
@@ -578,7 +604,7 @@
case UCNV_SET_FILTER_2022_CN:
/* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
do {
- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
+ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
sa->add(sa->set, c);
}
st3>>=1;
@@ -588,7 +614,20 @@
case UCNV_SET_FILTER_SJIS:
/* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
do {
- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
+ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+ stage3+=2; /* +=st3Multiplier */
+ } while((++c&0xf)!=0);
+ break;
+ case UCNV_SET_FILTER_GR94DBCS:
+ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
+ do {
+ if( ((st3&1)!=0 || useFallback) &&
+ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) &&
+ (uint8_t)(value-0xa1)<=(0xfe-0xa1)
+ ) {
sa->add(sa->set, c);
}
st3>>=1;
@@ -609,7 +648,7 @@
}
}
- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
+ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
}
U_CFUNC void
diff -ru icu.icu5797/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
--- icu.icu5797/source/common/ucnvmbcs.h 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnvmbcs.h 2009-06-11 13:21:14.000000000 +0100
@@ -492,6 +492,7 @@
UCNV_SET_FILTER_DBCS_ONLY,
UCNV_SET_FILTER_2022_CN,
UCNV_SET_FILTER_SJIS,
+ UCNV_SET_FILTER_GR94DBCS,
UCNV_SET_FILTER_COUNT
} UConverterSetFilter;
diff -ru icu.icu5797/source/common/ucnv_set.c icu/source/common/ucnv_set.c
--- icu.icu5797/source/common/ucnv_set.c 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_set.c 2009-06-11 13:21:14.000000000 +0100
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 2003-2005, International Business Machines
+* Copyright (C) 2003-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -52,7 +52,8 @@
uset_add,
uset_addRange,
uset_addString,
- uset_remove
+ uset_remove,
+ uset_removeRange
};
sa.set=setFillIn;
diff -ru icu.icu5797/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h
--- icu.icu5797/source/common/unicode/ucnv.h 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/unicode/ucnv.h 2009-06-11 13:21:14.000000000 +0100
@@ -870,6 +870,8 @@
typedef enum UConverterUnicodeSet {
/** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
UCNV_ROUNDTRIP_SET,
+ /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */
+ UCNV_ROUNDTRIP_AND_FALLBACK_SET,
/** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */
UCNV_SET_COUNT
} UConverterUnicodeSet;
@@ -878,11 +880,16 @@
/**
* Returns the set of Unicode code points that can be converted by an ICU converter.
*
- * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET):
+ * Returns one of several kinds of set:
+ *
+ * 1. UCNV_ROUNDTRIP_SET
+ *
* The set of all Unicode code points that can be roundtrip-converted
- * (converted without any data loss) with the converter.
+ * (converted without any data loss) with the converter (ucnv_fromUnicode()).
* This set will not include code points that have fallback mappings
* or are only the result of reverse fallback mappings.
+ * This set will also not include PUA code points with fallbacks, although
+ * ucnv_fromUnicode() will always use those mappings despite ucnv_setFallback().
* See UTR #22 "Character Mapping Markup Language"
* at http://www.unicode.org/reports/tr22/
*
@@ -893,6 +900,12 @@
* by comparing its roundtrip set with the set of ExemplarCharacters from
* ICU's locale data or other sources
*
+ * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET
+ *
+ * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode())
+ * when fallbacks are turned on (see ucnv_setFallback()).
+ * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks).
+ *
* In the future, there may be more UConverterUnicodeSet choices to select
* sets with different properties.
*
diff -ru icu.icu5797/source/common/uset_imp.h icu/source/common/uset_imp.h
--- icu.icu5797/source/common/uset_imp.h 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/uset_imp.h 2009-06-11 13:21:14.000000000 +0100
@@ -36,6 +36,9 @@
typedef void U_CALLCONV
USetRemove(USet *set, UChar32 c);
+typedef void U_CALLCONV
+USetRemoveRange(USet *set, UChar32 start, UChar32 end);
+
/**
* Interface for adding items to a USet, to keep low-level code from
* statically depending on the USet implementation.
@@ -47,6 +50,7 @@
USetAddRange *addRange;
USetAddString *addString;
USetRemove *remove;
+ USetRemoveRange *removeRange;
};
typedef struct USetAdder USetAdder;
diff -ru icu.icu5797/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
--- icu.icu5797/source/test/intltest/convtest.cpp 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/test/intltest/convtest.cpp 2009-06-11 13:21:51.000000000 +0100
@@ -70,6 +70,7 @@
case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
+ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
default: name=""; break; //needed to end loop
}
}
@@ -465,6 +466,183 @@
}
}
+U_CDECL_BEGIN
+static void U_CALLCONV
+getUnicodeSetCallback(const void *context,
+ UConverterFromUnicodeArgs *fromUArgs,
+ const UChar* codeUnits,
+ int32_t length,
+ UChar32 codePoint,
+ UConverterCallbackReason reason,
+ UErrorCode *pErrorCode) {
+ if(reason<=UCNV_IRREGULAR) {
+ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
+ *pErrorCode=U_ZERO_ERROR; // skip
+ } // else ignore the reset, close and clone calls.
+}
+U_CDECL_END
+
+// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
+void
+ConversionTest::TestGetUnicodeSet2() {
+ // Build a string with all code points.
+ UChar32 cpLimit;
+ int32_t s0Length;
+ if(quick) {
+ cpLimit=s0Length=0x10000; // BMP only
+ } else {
+ cpLimit=0x110000;
+ s0Length=0x10000+0x200000; // BMP + surrogate pairs
+ }
+ UChar *s0=new UChar[s0Length];
+ if(s0==NULL) {
+ return;
+ }
+ UChar *s=s0;
+ UChar32 c;
+ UChar c2;
+ // low BMP
+ for(c=0; c<=0xd7ff; ++c) {
+ *s++=(UChar)c;
+ }
+ // trail surrogates
+ for(c=0xdc00; c<=0xdfff; ++c) {
+ *s++=(UChar)c;
+ }
+ // lead surrogates
+ // (after trails so that there is not even one surrogate pair in between)
+ for(c=0xd800; c<=0xdbff; ++c) {
+ *s++=(UChar)c;
+ }
+ // high BMP
+ for(c=0xe000; c<=0xffff; ++c) {
+ *s++=(UChar)c;
+ }
+ // supplementary code points = surrogate pairs
+ if(cpLimit==0x110000) {
+ for(c=0xd800; c<=0xdbff; ++c) {
+ for(c2=0xdc00; c2<=0xdfff; ++c2) {
+ *s++=(UChar)c;
+ *s++=c2;
+ }
+ }
+ }
+
+ static const char *const cnvNames[]={
+ "UTF-8",
+ "UTF-7",
+ "UTF-16",
+ "US-ASCII",
+ "ISO-8859-1",
+ "windows-1252",
+ "Shift-JIS",
+ "ibm-1390", // EBCDIC_STATEFUL table
+ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
+ // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
+ "ISO-2022-JP",
+ "JIS7",
+ "ISO-2022-CN",
+ "ISO-2022-CN-EXT",
+ "LMBCS"
+ };
+ char buffer[1024];
+ int32_t i;
+ for(i=0; i<LENGTHOF(cnvNames); ++i) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ UConverter *cnv=cnv_open(cnvNames[i], errorCode);
+ if(U_FAILURE(errorCode)) {
+ errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
+ continue;
+ }
+ UnicodeSet expected;
+ ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
+ ucnv_close(cnv);
+ continue;
+ }
+ UConverterUnicodeSet which;
+ for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
+ if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
+ ucnv_setFallback(cnv, TRUE);
+ }
+ expected.add(0, cpLimit-1);
+ s=s0;
+ UBool flush;
+ do {
+ char *t=buffer;
+ flush=(UBool)(s==s0+s0Length);
+ ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+ errorCode=U_ZERO_ERROR;
+ continue;
+ } else {
+ break; // unexpected error, should not occur
+ }
+ }
+ } while(!flush);
+ UnicodeSet set;
+ ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
+ if(cpLimit<0x110000) {
+ set.remove(cpLimit, 0x10ffff);
+ }
+ if(which==UCNV_ROUNDTRIP_SET) {
+ // ignore PUA code points because they will be converted even if they
+ // are fallbacks and when other fallbacks are turned off,
+ // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
+ expected.remove(0xe000, 0xf8ff);
+ expected.remove(0xf0000, 0xffffd);
+ expected.remove(0x100000, 0x10fffd);
+ set.remove(0xe000, 0xf8ff);
+ set.remove(0xf0000, 0xffffd);
+ set.remove(0x100000, 0x10fffd);
+ }
+ if(set!=expected) {
+ // First try to see if we have different sets because ucnv_getUnicodeSet()
+ // added strings: The above conversion method does not tell us what strings might be convertible.
+ // Remove strings from the set and compare again.
+ // Unfortunately, there are no good, direct set methods for finding out whether there are strings
+ // in the set, nor for enumerating or removing just them.
+ // Intersect all code points with the set. The intersection will not contain strings.
+ UnicodeSet temp(0, 0x10ffff);
+ temp.retainAll(set);
+ set=temp;
+ }
+ if(set!=expected) {
+ UnicodeSet diffSet;
+ UnicodeString out;
+
+ // are there items that must be in the set but are not?
+ (diffSet=expected).removeAll(set);
+ if(!diffSet.isEmpty()) {
+ diffSet.toPattern(out, TRUE);
+ if(out.length()>100) {
+ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+ }
+ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
+ cnvNames[i], which);
+ errln(out);
+ }
+
+ // are there items that must not be in the set but are?
+ (diffSet=set).removeAll(expected);
+ if(!diffSet.isEmpty()) {
+ diffSet.toPattern(out, TRUE);
+ if(out.length()>100) {
+ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+ }
+ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
+ cnvNames[i], which);
+ errln(out);
+ }
+ }
+ }
+ }
+
+ delete [] s0;
+}
+
// open testdata or ICU data converter ------------------------------------- ***
UConverter *
diff -ru icu.icu5797/source/test/intltest/convtest.h icu/source/test/intltest/convtest.h
--- icu.icu5797/source/test/intltest/convtest.h 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/test/intltest/convtest.h 2009-06-11 13:21:13.000000000 +0100
@@ -72,6 +72,7 @@
void TestToUnicode();
void TestFromUnicode();
void TestGetUnicodeSet();
+ void TestGetUnicodeSet2();
private:
UBool
diff -ru icu.icu5797/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.icu5797/source/test/testdata/conversion.txt 2009-06-11 12:26:42.000000000 +0100
+++ icu/source/test/testdata/conversion.txt 2009-06-11 13:21:33.000000000 +0100
@@ -1311,16 +1311,29 @@
// versions of ISO-2022-JP
{
"ISO-2022-JP",
- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
+ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
+ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
:int{0}
- }
+ }
{
"ISO-2022-JP-2",
- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
- "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
+ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
+ "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
+ :int{0}
+ }
+ {
+ "JIS7",
+ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
+ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
:int{0}
}
+ // with fallbacks
+ {
+ "ISO-2022-JP",
+ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
+ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
+ :int{1}
+ }
// versions of ISO-2022-CN
{
@@ -1352,6 +1365,14 @@
:int{0}
}
+ // LMBCS
+ {
+ "LMBCS",
+ "[\x00-\U0010ffff]",
+ "[]",
+ :int{0}
+ }
+
// extensions
{
"ibm-1390",
icu.icu6002.backport.patch:
--- NEW FILE icu.icu6002.backport.patch ---
diff -ru icu.6001/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
--- icu.6001/source/common/ucnv_ext.c 2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnv_ext.c 2009-06-11 13:30:06.000000000 +0100
@@ -1036,15 +1036,13 @@
/* enumerate the from-Unicode trie table */
c=0; /* keep track of the current code point while enumerating */
- if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
- filter==UCNV_SET_FILTER_DBCS_ONLY ||
- filter==UCNV_SET_FILTER_SJIS ||
- filter==UCNV_SET_FILTER_GR94DBCS
+ if(filter==UCNV_SET_FILTER_2022_CN) {
+ minLength=3;
+ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
+ filter!=UCNV_SET_FILTER_NONE
) {
/* DBCS-only, ignore single-byte results */
minLength=2;
- } else if(filter==UCNV_SET_FILTER_2022_CN) {
- minLength=3;
} else {
minLength=1;
}
@@ -1099,8 +1097,15 @@
break;
case UCNV_SET_FILTER_GR94DBCS:
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
- (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) &&
- (uint8_t)(value-0xa1)<=(0xfe-0xa1))) {
+ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
+ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
+ continue;
+ }
+ break;
+ case UCNV_SET_FILTER_HZ:
+ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
+ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
+ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
continue;
}
break;
diff -ru icu.6001/source/common/ucnvhz.c icu/source/common/ucnvhz.c
--- icu.6001/source/common/ucnvhz.c 2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnvhz.c 2009-06-11 13:30:03.000000000 +0100
@@ -72,7 +72,7 @@
cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
if(cnv->extraInfo != NULL){
uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
+ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);
}
else {
*errorCode = U_MEMORY_ALLOCATION_ERROR;
@@ -141,7 +141,7 @@
UChar *myTarget = args->target;
const char *mySourceLimit = args->sourceLimit;
UChar32 targetUniChar = 0x0000;
- UChar mySourceChar = 0x0000;
+ int32_t mySourceChar = 0x0000;
UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
tempBuf[0]=0;
tempBuf[1]=0;
@@ -156,90 +156,71 @@
mySourceChar= (unsigned char) *mySource++;
- switch(mySourceChar){
+ if(args->converter->mode == UCNV_TILDE) {
+ /* second byte after ~ */
+ args->converter->mode=0;
+ switch(mySourceChar) {
case 0x0A:
- if(args->converter->mode ==UCNV_TILDE){
- args->converter->mode=0;
-
- }
- *(myTarget++)=(UChar)mySourceChar;
+ /* no output for ~\n (line-continuation marker) */
continue;
-
case UCNV_TILDE:
- if(args->converter->mode ==UCNV_TILDE){
- *(myTarget++)=(UChar)mySourceChar;
- args->converter->mode=0;
- continue;
-
+ if(args->offsets) {
+ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
}
- else if(args->converter->toUnicodeStatus !=0){
- args->converter->mode=0;
- break;
- }
- else{
- args->converter->mode = UCNV_TILDE;
- continue;
- }
-
-
+ *(myTarget++)=(UChar)mySourceChar;
+ continue;
case UCNV_OPEN_BRACE:
- if(args->converter->mode == UCNV_TILDE){
- args->converter->mode=0;
- myData->isStateDBCS = TRUE;
- continue;
- }
- else{
- break;
- }
-
-
+ myData->isStateDBCS = TRUE;
+ continue;
case UCNV_CLOSE_BRACE:
- if(args->converter->mode == UCNV_TILDE){
- args->converter->mode=0;
- myData->isStateDBCS = FALSE;
- continue;
- }
- else{
- break;
- }
-
+ myData->isStateDBCS = FALSE;
+ continue;
default:
/* if the first byte is equal to TILDE and the trail byte
* is not a valid byte then it is an error condition
*/
- if(args->converter->mode == UCNV_TILDE){
- args->converter->mode=0;
- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
- goto SAVE_STATE;
- }
-
+ mySourceChar = 0x7e00 | mySourceChar;
+ targetUniChar = 0xffff;
break;
-
- }
-
- if(myData->isStateDBCS){
+ }
+ } else if(myData->isStateDBCS) {
if(args->converter->toUnicodeStatus == 0x00){
- args->converter->toUnicodeStatus = (UChar) mySourceChar;
+ /* lead byte */
+ if(mySourceChar == UCNV_TILDE) {
+ args->converter->mode = UCNV_TILDE;
+ } else {
+ /* add another bit to distinguish a 0 byte from not having seen a lead byte */
+ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
+ }
continue;
}
else{
- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
- tempBuf[1] = (char) (mySourceChar+0x80);
- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
+ /* trail byte */
+ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
+ if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
+ (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
+ ) {
+ tempBuf[0] = (char) (leadByte+0x80) ;
+ tempBuf[1] = (char) (mySourceChar+0x80);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+ tempBuf, 2, args->converter->useFallback);
+ } else {
+ targetUniChar = 0xffff;
+ }
+ /* add another bit so that the code below writes 2 bytes in case of error */
+ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
args->converter->toUnicodeStatus =0x00;
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
- tempBuf, 2, args->converter->useFallback);
}
}
else{
- if(args->converter->fromUnicodeStatus == 0x00){
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
- mySource - 1, 1, args->converter->useFallback);
- }
- else{
- goto SAVE_STATE;
+ if(mySourceChar == UCNV_TILDE) {
+ args->converter->mode = UCNV_TILDE;
+ continue;
+ } else if(mySourceChar <= 0x7f) {
+ targetUniChar = (UChar)mySourceChar; /* ASCII */
+ } else {
+ targetUniChar = 0xffff;
}
-
}
if(targetUniChar < 0xfffe){
if(args->offsets) {
@@ -248,26 +229,17 @@
*(myTarget++)=(UChar)targetUniChar;
}
- else if(targetUniChar>=0xfffe){
-SAVE_STATE:
+ else /* targetUniChar>=0xfffe */ {
if(targetUniChar == 0xfffe){
*err = U_INVALID_CHAR_FOUND;
}
else{
*err = U_ILLEGAL_CHAR_FOUND;
}
- if(myData->isStateDBCS){
- /* this should never occur since isStateDBCS is set to true
- * only after tempBuf[0] and tempBuf[1]
- * are set to the input .. just to please BEAM
- */
- if(tempBuf[0]==0 || tempBuf[1]==0){
- *err = U_INTERNAL_PROGRAM_ERROR;
- }else{
- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
- args->converter->toULength=2;
- }
+ if(mySourceChar > 0xff){
+ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
+ args->converter->toUBytes[1] = (uint8_t)mySourceChar;
+ args->converter->toULength=2;
}
else{
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
@@ -328,16 +300,21 @@
escSeq = TILDE_ESCAPE;
CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
continue;
- }
- else{
+ } else if(mySourceChar <= 0x7f) {
+ length = 1;
+ targetUniChar = mySourceChar;
+ } else {
length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
mySourceChar,&targetUniChar,args->converter->useFallback);
-
- }
- /* only DBCS or SBCS characters are expected*/
- /* DB haracters with high bit set to 1 are expected */
- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
- targetUniChar= missingCharMarker;
+ /* we can only use lead bytes 21..7D and trail bytes 21..7E */
+ if( length == 2 &&
+ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
+ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
+ ) {
+ targetUniChar -= 0x8080;
+ } else {
+ targetUniChar = missingCharMarker;
+ }
}
if (targetUniChar != missingCharMarker){
myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
@@ -360,22 +337,22 @@
if(isTargetUCharDBCS){
if( myTargetIndex <targetLength){
- myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
+ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
if(offsets){
*(offsets++) = mySourceIndex-1;
}
if(myTargetIndex < targetLength){
- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
+ myTarget[myTargetIndex++] =(char) targetUniChar;
if(offsets){
*(offsets++) = mySourceIndex-1;
}
}else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
*err = U_BUFFER_OVERFLOW_ERROR;
}
}else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
*err = U_BUFFER_OVERFLOW_ERROR;
}
@@ -524,15 +501,14 @@
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
- /* the tilde '~' is hardcoded in the converter */
- sa->add(sa->set, 0x7e);
+ /* HZ converts all of ASCII */
+ sa->addRange(sa->set, 0, 0x7f);
/* add all of the code points that the sub-converter handles */
- /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
- ((UConverterDataHZ*)cnv->extraInfo)->
- gbConverter->sharedData->impl->
- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
- sa, which, pErrorCode);
+ ucnv_MBCSGetFilteredUnicodeSetForUnicode(
+ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
+ sa, which, UCNV_SET_FILTER_HZ,
+ pErrorCode);
}
static const UConverterImpl _HZImpl={
diff -ru icu.6001/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.6001/source/common/ucnvmbcs.c 2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnvmbcs.c 2009-06-11 13:30:06.000000000 +0100
@@ -625,8 +625,21 @@
/* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
do {
if( ((st3&1)!=0 || useFallback) &&
- (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) &&
- (uint8_t)(value-0xa1)<=(0xfe-0xa1)
+ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
+ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
+ ) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+ stage3+=2; /* +=st3Multiplier */
+ } while((++c&0xf)!=0);
+ break;
+ case UCNV_SET_FILTER_HZ:
+ /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
+ do {
+ if( ((st3&1)!=0 || useFallback) &&
+ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
+ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
) {
sa->add(sa->set, c);
}
diff -ru icu.6001/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
--- icu.6001/source/common/ucnvmbcs.h 2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnvmbcs.h 2009-06-11 13:30:03.000000000 +0100
@@ -493,6 +493,7 @@
UCNV_SET_FILTER_2022_CN,
UCNV_SET_FILTER_SJIS,
UCNV_SET_FILTER_GR94DBCS,
+ UCNV_SET_FILTER_HZ,
UCNV_SET_FILTER_COUNT
} UConverterSetFilter;
diff -ru icu.6001/source/test/cintltst/ncnvtst.c icu/source/test/cintltst/ncnvtst.c
--- icu.6001/source/test/cintltst/ncnvtst.c 2009-06-11 13:22:40.000000000 +0100
+++ icu/source/test/cintltst/ncnvtst.c 2009-06-11 13:30:03.000000000 +0100
@@ -1928,7 +1928,7 @@
#if !UCONFIG_NO_LEGACY_CONVERSION
{ "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
{ "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
+ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */
{ "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
#else
{ "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }
diff -ru icu.6001/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
--- icu.6001/source/test/intltest/convtest.cpp 2009-06-11 13:22:40.000000000 +0100
+++ icu/source/test/intltest/convtest.cpp 2009-06-11 13:30:03.000000000 +0100
@@ -538,7 +538,7 @@
"Shift-JIS",
"ibm-1390", // EBCDIC_STATEFUL table
"ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
- // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
+ "HZ",
"ISO-2022-JP",
"JIS7",
"ISO-2022-CN",
diff -ru icu.6001/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.6001/source/test/testdata/conversion.txt 2009-06-11 13:22:40.000000000 +0100
+++ icu/source/test/testdata/conversion.txt 2009-06-11 13:30:03.000000000 +0100
@@ -48,6 +48,14 @@
toUnicode {
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
Cases {
+ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
+ {
+ "HZ",
+ :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
+ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
+ :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
// improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
// using the Shift-JIS table for JIS X 0208 (ticket #5797)
{
@@ -1349,6 +1357,14 @@
:int{0}
}
+ // HZ
+ {
+ "HZ",
+ "[\u0410-\u044f\u4e00\u4e01\u4e03]",
+ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
+ :int{0}
+ }
+
// DBCS-only
{
"ibm-971",
Index: icu.spec
===================================================================
RCS file: /cvs/pkgs/rpms/icu/F-9/icu.spec,v
retrieving revision 1.72
retrieving revision 1.73
diff -u -p -r1.72 -r1.73
--- icu.spec 26 Aug 2008 12:59:56 -0000 1.72
+++ icu.spec 11 Jun 2009 19:28:45 -0000 1.73
@@ -1,6 +1,6 @@
Name: icu
Version: 3.8.1
-Release: 8%{?dist}
+Release: 9%{?dist}
Summary: International Components for Unicode
Group: Development/Tools
License: MIT
@@ -18,6 +18,11 @@ Patch6: icu.icu5498.openoffice.org.patch
Patch7: icu.regexp.patch
Patch8: icu.icu6213.worstcase.patch
Patch9: icu.icu6108.malayalam.samvruthokaram.patch
+Patch10: icu.icu5797.backport.patch
+Patch11: icu.icu6001.backport.patch
+Patch12: icu.icu6002.backport.patch
+Patch13: icu.icu6175.emptysegments.patch
+Patch14: icu.icu5691.backport.patch
%description
Tools and utilities for developing with icu.
@@ -64,6 +69,11 @@ Group: Documentation
%patch7 -p0 -b .regexp.patch
%patch8 -p1 -b .icu6213.worstcase.patch
%patch9 -p1 -b .icu6108.malayalam.samvruthokaram.patch
+%patch10 -p1 -b .icu5797.backport.patch
+%patch11 -p1 -b .icu6001.backport.patch
+%patch12 -p1 -b .icu6002.backport.patch
+%patch13 -p1 -b .icu6175.emptysegments.patch
+%patch14 -p1 -b .icu.icu5691.backport.patch
%build
cd source
@@ -141,6 +151,9 @@ rm -rf $RPM_BUILD_ROOT
%doc source/__docs/%{name}/html/*
%changelog
+* Thu Jun 11 2009 Caolan McNamara <caolanm at redhat.com> - 3.8.1-9
+- Resolves: rhbz#505368 CVE-2009-0153 Handle illegal sequences consistently
+
* Tue Aug 26 2008 Caolan McNamara <caolanm at redhat.com> - 3.8.1-8
- Resolves: rhbz#459698 drop Malayalam patches. Note test with
multiple fonts and not just Lohit Malayalam before filing bugs against icu
More information about the fedora-extras-commits
mailing list