rpms/icu/F-10 icu.icu5691.sequences.patch,NONE,1.1
Caolan McNamara
caolanm at fedoraproject.org
Thu Jun 11 16:54:53 UTC 2009
Author: caolanm
Update of /cvs/pkgs/rpms/icu/F-10
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv26323
Added Files:
icu.icu5691.sequences.patch
Log Message:
Resolves: rhbz#505369 CVE-2009-0153 Handle illegal sequences consistently
icu.icu5691.sequences.patch:
--- NEW FILE icu.icu5691.sequences.patch ---
diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c
--- icu.orig/source/common/ucnv2022.c 2009-06-11 11:53:51.000000000 +0100
+++ icu/source/common/ucnv2022.c 2009-06-11 12:08:15.000000000 +0100
@@ -754,6 +754,7 @@
UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
uint32_t key = myData2022->key;
int32_t offset = 0;
+ int8_t initialToULength = _this->toULength;
char c;
value = VALID_NON_TERMINAL_2022;
@@ -806,7 +807,6 @@
return;
} else if (value == INVALID_2022 ) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
- return;
} else /* value == VALID_TERMINAL_2022 */ {
switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
@@ -938,6 +938,35 @@
}
if(U_SUCCESS(*err)) {
_this->toULength = 0;
+ } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
+ if(_this->toULength>1) {
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte (ESC) in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ * In escape sequences, all following bytes are "printable", that is,
+ * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
+ * they are valid single/lead bytes.
+ * For simplicity, we always only report the initial ESC byte as the
+ * illegal sequence and back out all other bytes we looked at.
+ */
+ /* Back out some bytes. */
+ int8_t backOutDistance=_this->toULength-1;
+ int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
+ if(backOutDistance<=bytesFromThisBuffer) {
+ /* same as initialToULength<=1 */
+ *source-=backOutDistance;
+ } else {
+ /* Back out bytes from the previous buffer: Need to replay them. */
+ _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
+ /* same as -(initialToULength-1) */
+ /* preToULength is negative! */
+ uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
+ *source-=bytesFromThisBuffer;
+ }
+ _this->toULength=1;
+ }
} else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
_this->toUCallbackReason = UCNV_UNASSIGNED;
}
@@ -1118,6 +1147,7 @@
}
}
+#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
* This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
* 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
@@ -1133,6 +1163,7 @@
return value;
}
}
+#endif
#ifdef U_ENABLE_GENERIC_ISO_2022
@@ -1990,6 +2021,7 @@
mySourceChar = args->converter->toUBytes[0];
args->converter->toULength = 0;
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ targetUniChar = missingCharMarker;
goto getTrailByte;
}
@@ -2119,21 +2151,44 @@
default:
/* G0 DBCS */
if(mySource < mySourceLimit) {
- char trailByte;
+ int leadIsOk, trailIsOk;
+ uint8_t trailByte;
getTrailByte:
- trailByte = *mySource++;
- tmpSourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
- if(cs == JISX208) {
- _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
- } else {
- if (cs == KSC5601) {
- tmpSourceChar = _2022ToGR94DBCS(tmpSourceChar);
+ trailByte = (uint8_t)*mySource;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
+ * an ESC/SO/SI, we report only the first byte as the illegal sequence.
+ * Otherwise we convert or report the pair of bytes.
+ */
+ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk && trailIsOk) {
+ ++mySource;
+ tmpSourceChar = (mySourceChar << 8) | trailByte;
+ if(cs == JISX208) {
+ _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
+ mySourceChar = tmpSourceChar;
+ } else {
+ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
+ mySourceChar = tmpSourceChar;
+ if (cs == KSC5601) {
+ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
+ }
+ tempBuf[0] = (char)(tmpSourceChar >> 8);
+ tempBuf[1] = (char)(tmpSourceChar);
}
- tempBuf[0] = (char)(tmpSourceChar >> 8);
- tempBuf[1] = (char)(tmpSourceChar);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
+ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
+ ++mySource;
+ /* add another bit so that the code below writes 2 bytes in case of error */
+ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
}
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
- mySourceChar = tmpSourceChar;
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
args->converter->toULength = 1;
@@ -2275,7 +2330,12 @@
}
/* only DBCS or SBCS characters are expected*/
/* DB characters with high bit set to 1 are expected */
- if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
+ if( length > 2 || length==0 ||
+ (length == 1 && targetByteUnit > 0x7f) ||
+ (length == 2 &&
+ ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
+ (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
+ ) {
targetByteUnit=missingCharMarker;
}
if (targetByteUnit != missingCharMarker){
@@ -2604,17 +2664,34 @@
myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
if(myData->toU2022State.g == 1) {
if(mySource < mySourceLimit) {
- char trailByte;
+ int leadIsOk, trailIsOk;
+ uint8_t trailByte;
getTrailByte:
- trailByte = *mySource++;
- tempBuf[0] = (char)(mySourceChar + 0x80);
- tempBuf[1] = (char)(trailByte + 0x80);
- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
- if((mySourceChar & 0x8080) == 0) {
+ targetUniChar = missingCharMarker;
+ trailByte = (uint8_t)*mySource;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
+ * an ESC/SO/SI, we report only the first byte as the illegal sequence.
+ * Otherwise we convert or report the pair of bytes.
+ */
+ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk && trailIsOk) {
+ ++mySource;
+ tempBuf[0] = (char)(mySourceChar + 0x80);
+ tempBuf[1] = (char)(trailByte + 0x80);
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
- } else {
- /* illegal bytes > 0x7f */
- targetUniChar = missingCharMarker;
+ mySourceChar = (mySourceChar << 8) | trailByte;
+ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
+ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
+ ++mySource;
+ /* add another bit so that the code below writes 2 bytes in case of error */
+ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
}
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
@@ -2622,8 +2699,10 @@
break;
}
}
- else{
+ else if(mySourceChar <= 0x7f) {
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
+ } else {
+ targetUniChar = 0xffff;
}
if(targetUniChar < 0xfffe){
if(args->offsets) {
@@ -3120,6 +3199,7 @@
/* continue with a partial double-byte character */
mySourceChar = args->converter->toUBytes[0];
args->converter->toULength = 0;
+ targetUniChar = missingCharMarker;
goto getTrailByte;
}
@@ -3199,29 +3279,50 @@
UConverterSharedData *cnv;
StateEnum tempState;
int32_t tempBufLen;
- char trailByte;
+ int leadIsOk, trailIsOk;
+ uint8_t trailByte;
getTrailByte:
- trailByte = *mySource++;
- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
- if(tempState >= CNS_11643_0) {
- cnv = myData->myConverterArray[CNS_11643];
- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
- tempBuf[1] = (char) (mySourceChar);
- tempBuf[2] = trailByte;
- tempBufLen = 3;
-
- }else{
- cnv = myData->myConverterArray[tempState];
- tempBuf[0] = (char) (mySourceChar);
- tempBuf[1] = trailByte;
- tempBufLen = 2;
+ trailByte = (uint8_t)*mySource;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
+ * an ESC/SO/SI, we report only the first byte as the illegal sequence.
+ * Otherwise we convert or report the pair of bytes.
+ */
+ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk && trailIsOk) {
+ ++mySource;
+ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ if(tempState >= CNS_11643_0) {
+ cnv = myData->myConverterArray[CNS_11643];
+ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
+ tempBuf[1] = (char) (mySourceChar);
+ tempBuf[2] = (char) trailByte;
+ tempBufLen = 3;
+
+ }else{
+ cnv = myData->myConverterArray[tempState];
+ tempBuf[0] = (char) (mySourceChar);
+ tempBuf[1] = (char) trailByte;
+ tempBufLen = 2;
+ }
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
+ mySourceChar = (mySourceChar << 8) | trailByte;
+ } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
+ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
+ ++mySource;
+ /* add another bit so that the code below writes 2 bytes in case of error */
+ mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
}
- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
if(pToU2022State->g>=2) {
/* return from a single-shift state to the previous one */
pToU2022State->g=pToU2022State->prevG;
}
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
args->converter->toULength = 1;
diff -ru icu.orig/source/common/ucnvhz.c icu/source/common/ucnvhz.c
--- icu.orig/source/common/ucnvhz.c 2009-06-11 11:53:51.000000000 +0100
+++ icu/source/common/ucnvhz.c 2009-06-11 12:08:15.000000000 +0100
@@ -198,10 +198,30 @@
/* if the first byte is equal to TILDE and the trail byte
* is not a valid byte then it is an error condition
*/
- mySourceChar = 0x7e00 | mySourceChar;
- targetUniChar = 0xffff;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ */
myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
- break;
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ args->converter->toUBytes[0] = UCNV_TILDE;
+ if( myData->isStateDBCS ?
+ (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
+ mySourceChar <= 0x7f
+ ) {
+ /* The current byte could be the start of a character: Back it out. */
+ args->converter->toULength = 1;
+ --mySource;
+ } else {
+ /* Include the current byte in the illegal sequence. */
+ args->converter->toUBytes[1] = mySourceChar;
+ args->converter->toULength = 2;
+ }
+ args->target = myTarget;
+ args->source = mySource;
+ return;
}
} else if(myData->isStateDBCS) {
if(args->converter->toUnicodeStatus == 0x00){
@@ -217,19 +237,36 @@
}
else{
/* trail byte */
+ int leadIsOk, trailIsOk;
uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
- if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
- (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
- ) {
+ targetUniChar = 0xffff;
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ *
+ * In HZ DBCS, if the second byte is in the 21..7e range,
+ * we report only the first byte as the illegal sequence.
+ * Otherwise we convert or report the pair of bytes.
+ */
+ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
+ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+ if (leadIsOk && trailIsOk) {
tempBuf[0] = (char) (leadByte+0x80) ;
tempBuf[1] = (char) (mySourceChar+0x80);
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
tempBuf, 2, args->converter->useFallback);
+ mySourceChar= (leadByte << 8) | mySourceChar;
+ } else if (trailIsOk) {
+ /* report a single illegal byte and continue with the following DBCS starter byte */
+ --mySource;
+ mySourceChar = (int32_t)leadByte;
} else {
- targetUniChar = 0xffff;
+ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
+ /* add another bit so that the code below writes 2 bytes in case of error */
+ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
}
- /* add another bit so that the code below writes 2 bytes in case of error */
- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
args->converter->toUnicodeStatus =0x00;
}
}
diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.orig/source/common/ucnvmbcs.c 2009-06-11 11:53:51.000000000 +0100
+++ icu/source/common/ucnvmbcs.c 2009-06-11 12:07:14.000000000 +0100
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 2000-2007, International Business Machines
+* Copyright (C) 2000-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -2151,6 +2151,65 @@
pArgs->offsets=offsets;
}
+static UBool
+hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
+ const int32_t *row=stateTable[state];
+ int32_t b, entry;
+ /*
+ * Purpose: return TRUE if some byte sequence continuing from "state" reaches
+ * a final (non-transition) entry whose action is not MBCS_STATE_ILLEGAL;
+ * return FALSE if no such entry is found in this state or in any state
+ * reached through its transition entries.
+ *
+ * First test for final entries in this state for some commonly valid byte values.
+ * These two probes (0xa1 and 0x41) are only a shortcut: the full scan below
+ * covers the same bytes, so they cannot change the result.
+ */
+ entry=row[0xa1];
+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+ ) {
+ return TRUE;
+ }
+ entry=row[0x41];
+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+ ) {
+ return TRUE;
+ }
+ /* Then test for final entries in this state (all 256 byte values). */
+ for(b=0; b<=0xff; ++b) {
+ entry=row[b];
+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+ ) {
+ return TRUE;
+ }
+ }
+ /*
+ * Then recurse for transition entries.
+ * NOTE(review): the recursion only terminates if the transitions reachable
+ * from "state" never loop back to an earlier state; presumably the state
+ * table format guarantees that -- confirm against the table builder.
+ */
+ for(b=0; b<=0xff; ++b) {
+ entry=row[b];
+ if( MBCS_ENTRY_IS_TRANSITION(entry) &&
+ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
+ ) {
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
+/*
+ * Is byte b a single/lead byte in this state?
+ * Recurse for transition states, because here we don't want to say that
+ * b is a lead byte if all byte sequences that start with b are illegal.
+ *
+ * stateTable, state: select the row in which b is looked up.
+ * isDBCSOnly: when TRUE, a final entry whose action is MBCS_STATE_CHANGE_ONLY
+ * is rejected (returns FALSE).
+ * b: the byte value to classify.
+ *
+ * Returns: for a transition entry, the result of hasValidTrailBytes() on the
+ * target state; for a final entry, TRUE unless the action is
+ * MBCS_STATE_ILLEGAL or the isDBCSOnly rule above applies.
+ */
+static UBool
+isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
+ const int32_t *row=stateTable[state];
+ int32_t entry=row[b];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
+ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
+ } else {
+ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
+ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
+ return FALSE; /* SI/SO are illegal for DBCS-only conversion */
+ } else {
+ return action!=MBCS_STATE_ILLEGAL;
+ }
+ }
+}
+
U_CFUNC void
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
@@ -2506,6 +2565,34 @@
sourceIndex=nextSourceIndex;
} else if(U_FAILURE(*pErrorCode)) {
/* callback(illegal) */
+ if(byteIndex>1) {
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ */
+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
+ int8_t i;
+ for(i=1;
+ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
+ ++i) {}
+ if(i<byteIndex) {
+ /* Back out some bytes. */
+ int8_t backOutDistance=byteIndex-i;
+ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
+ byteIndex=i; /* length of reported illegal byte sequence */
+ if(backOutDistance<=bytesFromThisBuffer) {
+ source-=backOutDistance;
+ } else {
+ /* Back out bytes from the previous buffer: Need to replay them. */
+ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
+ /* preToULength is negative! */
+ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
+ source=(const uint8_t *)pArgs->source;
+ }
+ }
+ }
break;
} else /* unassigned sequences indicated with byteIndex>0 */ {
/* try an extension mapping */
@@ -2516,7 +2603,7 @@
&offsets, sourceIndex,
pArgs->flush,
pErrorCode);
- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
+ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
if(U_FAILURE(*pErrorCode)) {
/* not mappable or buffer overflow */
@@ -2807,15 +2894,37 @@
if(c<0) {
if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
- }
- if(U_FAILURE(*pErrorCode)) {
/* incomplete character byte sequence */
uint8_t *bytes=cnv->toUBytes;
cnv->toULength=(int8_t)(source-lastSource);
do {
*bytes++=*lastSource++;
} while(lastSource<source);
+ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+ } else if(U_FAILURE(*pErrorCode)) {
+ /* callback(illegal) */
+ /*
+ * Ticket 5691: consistent illegal sequences:
+ * - We include at least the first byte in the illegal sequence.
+ * - If any of the non-initial bytes could be the start of a character,
+ * we stop the illegal sequence before the first one of those.
+ */
+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
+ uint8_t *bytes=cnv->toUBytes;
+ *bytes++=*lastSource++; /* first byte */
+ if(lastSource==source) {
+ cnv->toULength=1;
+ } else /* lastSource<source: multi-byte character */ {
+ int8_t i;
+ for(i=1;
+ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
+ ++i
+ ) {
+ *bytes++=*lastSource++;
+ }
+ cnv->toULength=i;
+ source=lastSource;
+ }
} else {
/* no output because of empty input or only state changes */
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
diff -ru icu.orig/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c
--- icu.orig/source/test/cintltst/nccbtst.c 2009-06-11 11:53:49.000000000 +0100
+++ icu/source/test/cintltst/nccbtst.c 2009-06-11 12:06:30.000000000 +0100
@@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2007, International Business Machines Corporation and
+ * Copyright (c) 1997-2008, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*
@@ -2530,13 +2530,13 @@
static const uint8_t text943[] = {
- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
+ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
+ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 };
+ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 };
static const UChar toUnicode943stop[]= { 0x304b};
- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};
- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};
+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };
+ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
static const int32_t fromIBM943Offsstop[] = { 0};
gInBufferSize = inputsize;
@@ -2570,9 +2570,9 @@
{
static const uint8_t sampleText[] = {
0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033};
- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};
+ 0xff, 0x32, 0x33};
+ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
/*checking illegal value for ibm-943 with substitute*/
gInBufferSize = inputsize;
gOutBufferSize = outputsize;
diff -ru icu.orig/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
--- icu.orig/source/test/cintltst/nucnvtst.c 2009-06-11 11:53:45.000000000 +0100
+++ icu/source/test/cintltst/nucnvtst.c 2009-06-11 12:06:30.000000000 +0100
@@ -2608,7 +2608,7 @@
TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
/*Test for the condition where there is an invalid character*/
{
- static const uint8_t source2[]={0xa1, 0x01};
+ static const uint8_t source2[]={0xa1, 0x80};
TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
}
/*Test for the condition where we have a truncated char*/
@@ -3901,11 +3901,11 @@
TestISO_2022_KR() {
/* test input */
static const uint16_t in[]={
- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
+ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
+ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
+ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
,0x53E3,0x53E4,0x000A,0x000D};
const UChar* uSource;
const UChar* uSourceLimit;
Only in icu/source/test/cintltst: nucnvtst.c.orig
diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.orig/source/test/testdata/conversion.txt 2009-06-11 11:53:49.000000000 +0100
+++ icu/source/test/testdata/conversion.txt 2009-06-11 12:08:35.000000000 +0100
@@ -48,6 +48,138 @@
toUnicode {
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
Cases {
+ // Test ticket 5691: consistent illegal sequences
+ // The following test cases are for illegal character byte sequences.
+ //
+ // Unfortunately, we cannot use the Shift-JIS examples from the ticket
+ // comments because our Shift-JIS table is Windows-compatible and
+ // therefore has no illegal single bytes. Same for GBK.
+ // Instead, we use the stricter GB 18030 also for 2-byte examples.
+ // The byte sequences are generally slightly different from the ticket
+ // comment, simply using assigned characters rather than just
+ // theoretically valid sequences.
+ {
+ "gb18030",
+ :bin{ 618140813c81ff7a },
+ "a\u4e02\\x81<\\x81\\xFFz",
+ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "EUC-JP",
+ :bin{ 618fb0a98fb03c8f3cb0a97a },
+ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
+ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "gb18030",
+ :bin{ 618130fc318130fc8181303c3e813cfc817a },
+ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
+ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "UTF-8",
+ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
+ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
+ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-JP",
+ :bin{ 1b24424141af4142affe41431b2842 },
+ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
+ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ibm-25546",
+ :bin{ 411b242943420e4141af4142affe41430f5a },
+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-KR",
+ :bin{ 411b242943420e4141af4142affe41430f5a },
+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-CN",
+ :bin{ 411b242941420e4141af4142affe41430f5a },
+ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "HZ",
+ :bin{ 417e7b4141af4142affe41437e7d5a },
+ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
+ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ // Test ticket 5691: consistent illegal sequences
+ // The following test cases are for illegal escape/designator/shift sequences.
+ //
+ // ISO-2022-JP and -CN with illegal escape sequences.
+ {
+ "ISO-2022-JP",
+ :bin{ 611b24201b244241411b283f1b28427a },
+ "a\\x1B$ \u758f\\x1B\u2538z",
+ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-CN",
+ :bin{ 611b2429201b2429410e41410f7a },
+ "a\\x1B$) \u4eaez",
+ :intvector{ 0,1,1,1,1,2,3,4,10,13 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
+ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
+ {
+ "ISO-2022-JP-2",
+ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
+ "N\\x1BNNN\xceN\\x1B N",
+ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-CN-EXT",
+ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
+ "N\\x1BNNN\u8f0eN\\x1B N",
+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ {
+ "ISO-2022-CN-EXT",
+ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
+ "O\\x1BOOO\u492bO\\x1B O",
+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ // Test ticket 5691: HZ with illegal tilde sequences.
+ {
+ "HZ",
+ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },
+ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",
+ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS
+ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS
+ 25 }, // SBCS
+ :int{1}, :int{0}, "", "&C", :bin{""}
+ }
+ // Test ticket 5691: Example from Peter Edberg.
+ {
+ "ISO-2022-JP",
+ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
+ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
+ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
+ :int{1}, :int{0}, "", "?", :bin{""}
+ }
// Test bug 6071 (2:1 Unicode:charset SBCS mapping).
{
"*test1bmp",
@@ -59,9 +191,9 @@
// test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
{
"HZ",
- :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
- "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
- :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
+ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },
+ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",
+ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },
:int{1}, :int{1}, "", "?", :bin{""}
}
// improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
@@ -69,8 +201,8 @@
{
"ISO-2022-JP",
:bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
+ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
+ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
:int{1}, :int{1}, "", "?", :bin{""}
}
// improve coverage of ISO-2022-JP converter by simulating erroneous input
@@ -365,7 +497,7 @@
{
"ISO-2022-CN-EXT",
:bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
+ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
}
// G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
{
More information about the fedora-extras-commits
mailing list