rpms/icu/F-9 icu.icu5691.backport.patch, NONE, 1.1 icu.icu5797.backport.patch, NONE, 1.1 icu.icu6001.backport.patch, NONE, 1.1 icu.icu6002.backport.patch, NONE, 1.1 icu.spec, 1.72, 1.73

Thu Jun 11 19:29:15 UTC 2009

Author: caolanm

Update of /cvs/pkgs/rpms/icu/F-9
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv30990

Modified Files:
	icu.spec 
Added Files:
	icu.icu5691.backport.patch icu.icu5797.backport.patch 
	icu.icu6001.backport.patch icu.icu6002.backport.patch 
Log Message:
Resolves: rhbz#505368 CVE-2009-0153 Handle illegal sequences consistently

icu.icu5691.backport.patch:

--- NEW FILE icu.icu5691.backport.patch ---
diff -ru icu.6175/source/common/ucnv2022.c icu/source/common/ucnv2022.c
--- icu.6175/source/common/ucnv2022.c	2009-06-11 13:44:44.000000000 +0100
+++ icu/source/common/ucnv2022.c	2009-06-11 18:09:48.000000000 +0100
@@ -1973,6 +1973,7 @@
         mySourceChar = args->converter->toUBytes[0];
         args->converter->toULength = 0;
         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
+        targetUniChar = missingCharMarker;
         goto getTrailByte;
     }
 
@@ -2102,18 +2103,45 @@
                 default:
                     /* G0 DBCS */
                     if(mySource < mySourceLimit) {
+                        int leadIsOk, trailIsOk;
                         char trailByte;
 getTrailByte:
-                        trailByte = *mySource++;
-                        if(cs == JISX208) {
-                            _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
-                        } else {
-                            tempBuf[0] = (char)mySourceChar;
-                            tempBuf[1] = trailByte;
-                        }
-                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
-                    } else {
+                        trailByte = *mySource;
+                        /*
+                         * Ticket 5691: consistent illegal sequences:
+                         * - We include at least the first byte in the illegal sequence.
+                         * - If any of the non-initial bytes could be the start of a character,
+                         *   we stop the illegal sequence before the first one of those.
+                         *
+                         * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
+                         * the 21..7e range, then we treat them as a pair.
+                         * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+                         * we report only the first byte as the illegal sequence.
+                         */
+                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+                        if (leadIsOk == trailIsOk) {
+                            ++mySource;
+                            uint32_t tmpSourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+                            if (leadIsOk) {
+                                if(cs == JISX208) {
+                                    _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
+                                    mySourceChar = tmpSourceChar;
+                                } else {
+                                    /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
+                                    mySourceChar = tmpSourceChar;
+                                    if (cs == KSC5601) {
+                                        tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
+                                    }
+                                    tempBuf[0] = (char)(tmpSourceChar >> 8);
+                                    tempBuf[1] = (char)(tmpSourceChar);
+                                }
+                                targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+                            } else {
+                                mySourceChar = tmpSourceChar;
+                            }
+                          }
+                      } else {
                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                         args->converter->toULength = 1;
                         goto endloop;
@@ -2254,7 +2282,12 @@
             }
             /* only DBCS or SBCS characters are expected*/
             /* DB characters with high bit set to 1 are expected */
-            if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
+            if( length > 2 || length==0 ||
+                (length == 1 && targetByteUnit > 0x7f) ||
+                (length == 2 &&
+                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
+                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
+            ) {
                 targetByteUnit=missingCharMarker;
             }
             if (targetByteUnit != missingCharMarker){
@@ -2583,17 +2616,36 @@
             myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
             if(myData->toU2022State.g == 1) {
                 if(mySource < mySourceLimit) {
+                    int leadIsOk, trailIsOk;
                     char trailByte;
 getTrailByte:
-                    trailByte = *mySource++;
-                    tempBuf[0] = (char)(mySourceChar + 0x80);
-                    tempBuf[1] = (char)(trailByte + 0x80);
-                    mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
-                    if((mySourceChar & 0x8080) == 0) {
-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
+                    targetUniChar = missingCharMarker;
+                    trailByte = *mySource;
+                    /*
+                     * Ticket 5691: consistent illegal sequences:
+                     * - We include at least the first byte in the illegal sequence.
+                     * - If any of the non-initial bytes could be the start of a character,
+                     *   we stop the illegal sequence before the first one of those.
+                     *
+                     * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
+                     * the 21..7e range, then we treat them as a pair.
+                     * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+                     * we report only the first byte as the illegal sequence.
+                     */
+                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+                    if (leadIsOk == trailIsOk) {
+                        ++mySource;
+                        if (leadIsOk) {
+                            tempBuf[0] = (char)(mySourceChar + 0x80);
+                            tempBuf[1] = (char)(trailByte + 0x80);
+                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
+                        } else {
+                            leadIsOk = TRUE; /* TODO: remove */
+                        }
+                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
                     } else {
-                        /* illegal bytes > 0x7f */
-                        targetUniChar = missingCharMarker;
+                        trailIsOk = TRUE; /* TODO: remove */
                     }
                 } else {
                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
@@ -2601,8 +2653,10 @@
                     break;
                 }
             }
-            else{
+            else if(mySourceChar <= 0x7f) {
                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
+            } else {
+                targetUniChar = 0xffff;
             }
             if(targetUniChar < 0xfffe){
                 if(args->offsets) {
@@ -3099,6 +3153,7 @@
         /* continue with a partial double-byte character */
         mySourceChar = args->converter->toUBytes[0];
         args->converter->toULength = 0;
+        targetUniChar = missingCharMarker;
         goto getTrailByte;
     }
 
@@ -3178,29 +3233,48 @@
                         UConverterSharedData *cnv;
                         StateEnum tempState;
                         int32_t tempBufLen;
+                        int leadIsOk, trailIsOk;
                         char trailByte;
 getTrailByte:
-                        trailByte = *mySource++;
-                        tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
-                        if(tempState > CNS_11643_0) {
-                            cnv = myData->myConverterArray[CNS_11643];
-                            tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
-                            tempBuf[1] = (char) (mySourceChar);
-                            tempBuf[2] = trailByte;
-                            tempBufLen = 3;
-
-                        }else{
-                            cnv = myData->myConverterArray[tempState];
-                            tempBuf[0] = (char) (mySourceChar);
-                            tempBuf[1] = trailByte;
-                            tempBufLen = 2;
+                        trailByte = *mySource;
+                        /*
+                         * Ticket 5691: consistent illegal sequences:
+                         * - We include at least the first byte in the illegal sequence.
+                         * - If any of the non-initial bytes could be the start of a character,
+                         *   we stop the illegal sequence before the first one of those.
+                         *
+                         * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside
+                         * the 21..7e range, then we treat them as a pair.
+                         * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+                         * we report only the first byte as the illegal sequence.
+                         */
+                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
+                        if (leadIsOk == trailIsOk) {
+                            ++mySource;
+                            if (leadIsOk) {
+                                tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
+                                if(tempState >= CNS_11643_0) {
+                                    cnv = myData->myConverterArray[CNS_11643];
+                                    tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
+                                    tempBuf[1] = (char) (mySourceChar);
+                                    tempBuf[2] = trailByte;
+                                    tempBufLen = 3;
+
+                                }else{
+                                    cnv = myData->myConverterArray[tempState];
+                                    tempBuf[0] = (char) (mySourceChar);
+                                    tempBuf[1] = trailByte;
+                                    tempBufLen = 2;
+                                }
+                                targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
+                            }
+                            mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
                         }
-                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
                         if(pToU2022State->g>=2) {
                             /* return from a single-shift state to the previous one */
                             pToU2022State->g=pToU2022State->prevG;
                         }
-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                     } else {
                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                         args->converter->toULength = 1;
diff -ru icu.6175/source/common/ucnvhz.c icu/source/common/ucnvhz.c
--- icu.6175/source/common/ucnvhz.c	2009-06-11 13:44:44.000000000 +0100
+++ icu/source/common/ucnvhz.c	2009-06-11 18:05:36.000000000 +0100
@@ -215,19 +215,35 @@
                 }
                 else{
                     /* trail byte */
+                    int leadIsOk, trailIsOk;
                     uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
-                    if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
-                        (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
-                    ) {
-                        tempBuf[0] = (char) (leadByte+0x80) ;
-                        tempBuf[1] = (char) (mySourceChar+0x80);
-                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
-                            tempBuf, 2, args->converter->useFallback);
+                    targetUniChar = 0xffff;
+                    /*
+                     * Ticket 5691: consistent illegal sequences:
+                     * - We include at least the first byte in the illegal sequence.
+                     * - If any of the non-initial bytes could be the start of a character,
+                     *   we stop the illegal sequence before the first one of those.
+                     *
+                     * In HZ DBCS, if both bytes are valid or both bytes are outside
+                     * the 21..7d/7e range, then we treat them as a pair.
+                     * Otherwise (valid lead byte + illegal trail byte, or vice versa)
+                     * we report only the first byte as the illegal sequence.
+                     */
+                    leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
+                    trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
+                    if (leadIsOk == trailIsOk) {
+                        if (leadIsOk) {
+                            tempBuf[0] = (char) (leadByte+0x80) ;
+                            tempBuf[1] = (char) (mySourceChar+0x80);
+                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+                                tempBuf, 2, args->converter->useFallback);
+                        }
+                        /* add another bit so that the code below writes 2 bytes in case of error */
+                        mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
                     } else {
-                        targetUniChar = 0xffff;
+                        --mySource;
+                        mySourceChar = (int32_t)leadByte;
                     }
-                    /* add another bit so that the code below writes 2 bytes in case of error */
-                    mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
                     args->converter->toUnicodeStatus =0x00;
                 }
             }
diff -ru icu.6175/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.6175/source/common/ucnvmbcs.c	2009-06-11 13:44:44.000000000 +0100
+++ icu/source/common/ucnvmbcs.c	2009-06-11 18:05:36.000000000 +0100
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2000-2007, International Business Machines
+*   Copyright (C) 2000-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -1791,6 +1791,65 @@
     pArgs->offsets=offsets;
 }
 
+static UBool
+hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
+    const int32_t *row=stateTable[state];
+    int32_t b, entry;
+    /* First test for final entries in this state for some commonly valid byte values. */
+    entry=row[0xa1];
+    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+    ) {
+        return TRUE;
+    }
+    entry=row[0x41];
+    if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+        MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+    ) {
+        return TRUE;
+    }
+    /* Then test for final entries in this state. */
+    for(b=0; b<=0xff; ++b) {
+        entry=row[b];
+        if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
+            MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
+        ) {
+            return TRUE;
+        }
+    }
+    /* Then recurse for transition entries. */
+    for(b=0; b<=0xff; ++b) {
+        entry=row[b];
+        if( MBCS_ENTRY_IS_TRANSITION(entry) &&
+            hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
+        ) {
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+/*
+ * Is byte b a single/lead byte in this state?
+ * Recurse for transition states, because here we don't want to say that
+ * b is a lead byte if all byte sequences that start with b are illegal.
+ */
+static UBool
+isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
+    const int32_t *row=stateTable[state];
+    int32_t entry=row[b];
+    if(MBCS_ENTRY_IS_TRANSITION(entry)) {   /* lead byte */
+        return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
+    } else {
+        uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
+        if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
+            return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
+        } else {
+            return action!=MBCS_STATE_ILLEGAL;
+        }
+    }
+}
+
 U_CFUNC void
 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
@@ -2146,6 +2205,34 @@
             sourceIndex=nextSourceIndex;
         } else if(U_FAILURE(*pErrorCode)) {
             /* callback(illegal) */
+            if(byteIndex>1) {
+                /*
+                 * Ticket 5691: consistent illegal sequences:
+                 * - We include at least the first byte in the illegal sequence.
+                 * - If any of the non-initial bytes could be the start of a character,
+                 *   we stop the illegal sequence before the first one of those.
+                 */
+                UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
+                int8_t i;
+                for(i=1;
+                    i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
+                    ++i) {}
+                if(i<byteIndex) {
+                    /* Back out some bytes. */
+                    int8_t backOutDistance=byteIndex-i;
+                    int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
+                    byteIndex=i;  /* length of reported illegal byte sequence */
+                    if(backOutDistance<=bytesFromThisBuffer) {
+                        source-=backOutDistance;
+                    } else {
+                        /* Back out bytes from the previous buffer: Need to replay them. */
+                        cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
+                        /* preToULength is negative! */
+                        uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
+                        source=(const uint8_t *)pArgs->source;
+                    }
+                }
+            }
             break;
         } else /* unassigned sequences indicated with byteIndex>0 */ {
             /* try an extension mapping */
@@ -2156,6 +2243,7 @@
                               &offsets, sourceIndex,
                               pArgs->flush,
                               pErrorCode);
+            /* TODO: nextSourceIndex+=diff instead of nextSourceIndex+diff ?? */
             sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
 
             if(U_FAILURE(*pErrorCode)) {
@@ -2447,15 +2535,37 @@
 
     if(c<0) {
         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        if(U_FAILURE(*pErrorCode)) {
             /* incomplete character byte sequence */
             uint8_t *bytes=cnv->toUBytes;
             cnv->toULength=(int8_t)(source-lastSource);
             do {
                 *bytes++=*lastSource++;
             } while(lastSource<source);
+            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+        } else if(U_FAILURE(*pErrorCode)) {
+            /* callback(illegal) */
+            /*
+             * Ticket 5691: consistent illegal sequences:
+             * - We include at least the first byte in the illegal sequence.
+             * - If any of the non-initial bytes could be the start of a character,
+             *   we stop the illegal sequence before the first one of those.
+             */
+            UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
+            uint8_t *bytes=cnv->toUBytes;
+            *bytes++=*lastSource++;     /* first byte */
+            if(lastSource==source) {
+                cnv->toULength=1;
+            } else /* lastSource<source: multi-byte character */ {
+                int8_t i;
+                for(i=1;
+                    lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
+                    ++i
+                ) {
+                    *bytes++=*lastSource++;
+                }
+                cnv->toULength=i;
+                source=lastSource;
+            }
         } else {
             /* no output because of empty input or only state changes */
             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
diff -ru icu.6175/source/test/cintltst/nccbtst.c icu/source/test/cintltst/nccbtst.c
--- icu.6175/source/test/cintltst/nccbtst.c	2009-06-11 13:44:44.000000000 +0100
+++ icu/source/test/cintltst/nccbtst.c	2009-06-11 18:05:36.000000000 +0100
@@ -2497,13 +2497,13 @@
 
 
     static const uint8_t text943[] = {
-        0x82, 0xa9, 0x82, 0x20, /*0xc8,*/  0x61, 0x8a, 0xbf, 0x8e, 0x9a };
-    static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22,  0x5b57};
-    static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22,  0x5b57};
+        0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
+    static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22,  0x5b57 };
+    static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22,  0x5b57 };
     static const UChar toUnicode943stop[]= { 0x304b};
 
-    static const int32_t  fromIBM943Offssub[]  = {0, 2, 4, 5, 7};
-    static const int32_t  fromIBM943Offsskip[] = { 0, 4, 5, 7};
+    static const int32_t  fromIBM943Offssub[]  = { 0, 2, 3, 4, 5, 7 };
+    static const int32_t  fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
     static const int32_t  fromIBM943Offsstop[] = { 0};
 
     gInBufferSize = inputsize;
@@ -2537,9 +2537,9 @@
 {
     static const uint8_t sampleText[] = {
         0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
-        0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
-    static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063,  0xfffd,/*0x304b,*/ 0x0032, 0x0033};
-    static const int32_t  fromIBM943Offssub[]  = {0, 2, 3, 4, 5, 7, 8};
+        0xff, 0x32, 0x33};
+    static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
+    static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
     /*checking illegal value for ibm-943 with substitute*/ 
     gInBufferSize = inputsize;
     gOutBufferSize = outputsize;
diff -ru icu.6175/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
--- icu.6175/source/test/cintltst/nucnvtst.c	2009-06-11 13:44:44.000000000 +0100
+++ icu/source/test/cintltst/nucnvtst.c	2009-06-11 18:05:36.000000000 +0100
@@ -2608,7 +2608,7 @@
     TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
     /*Test for the condition where there is an invalid character*/
     {
-        static const uint8_t source2[]={0xa1, 0x01};
+        static const uint8_t source2[]={0xa1, 0x80};
         TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
     }
     /*Test for the condition where we have a truncated char*/
@@ -3901,11 +3901,11 @@
 TestISO_2022_KR() {
     /* test input */
     static const uint16_t in[]={
-                    0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
-                   ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
+                    0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
+                   ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
                    ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
                    ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
-                   ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
+                   ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
                    ,0x53E3,0x53E4,0x000A,0x000D};
     const UChar* uSource;
     const UChar* uSourceLimit;
diff -ru icu.6175/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.6175/source/test/testdata/conversion.txt	2009-06-11 13:44:44.000000000 +0100
+++ icu/source/test/testdata/conversion.txt	2009-06-11 18:05:36.000000000 +0100
@@ -48,12 +48,83 @@
     toUnicode {
       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
       Cases {
+        // Test ticket 5691: consistent illegal sequences
+        // Unfortunately, we cannot use the Shift-JIS examples from the ticket
+        // comments because our Shift-JIS table is Windows-compatible and
+        // therefore has no illegal single bytes. Same for GBK.
+        // Instead, we use the stricter GB 18030 also for 2-byte examples.
+        // The byte sequences are generally slightly different from the ticket
+        // comment, simply using assigned characters rather than just
+        // theoretically valid sequences.
+        {
+          "gb18030",
+          :bin{ 618140813c81ff7a },
+          "a\u4e02\\x81<\\x81\\xFFz",
+          :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "EUC-JP",
+          :bin{ 618fb0a98fb03c8f3cb0a97a },
+          "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
+          :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "gb18030",
+          :bin{ 618130fc318130fc8181303c3e813cfc817a },
+          "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
+          :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "UTF-8",
+          :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
+          "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
+          :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "ISO-2022-JP-2",
+          :bin{ 1b24424141af4142affe41431b2842 },
+          "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
+          :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "ibm-25546",
+          :bin{ 411b242943420e4141af4142affe41430f5a },
+          "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
+          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "ISO-2022-KR",
+          :bin{ 411b242943420e4141af4142affe41430f5a },
+          "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
+          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "ISO-2022-CN",
+          :bin{ 411b242941420e4141af4142affe41430f5a },
+          "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
+          :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
+        {
+          "HZ",
+          :bin{ 417e7b4141af4142affe41437e7d5a },
+          "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
+          :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
+          :int{1}, :int{0}, "", "&C", :bin{""}
+        }
         // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
         {
           "HZ",
           :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
-          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
-          :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
+          "\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
+          :intvector{ 2,4,5,6,8,9,10,12,14,18,19,21,24 },
           :int{1}, :int{1}, "", "?", :bin{""}
         }
         // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
@@ -61,8 +132,8 @@
         {
           "ISO-2022-JP",
           :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
-          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
-          :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
+          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\ufffd\ufffd\u25b2\ufffd\ufffd\u6f3e",
+          :intvector{ 3,4,5,9,11,12,13,14,16,17,19,20,21,22,23,25,26,27 },
           :int{1}, :int{1}, "", "?", :bin{""}
         }
         // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()

icu.icu5797.backport.patch:

--- NEW FILE icu.icu5797.backport.patch ---
diff -ru icu.orig/source/common/ucnv2022.c icu/source/common/ucnv2022.c
--- icu.orig/source/common/ucnv2022.c	2009-06-11 12:21:48.000000000 +0100
+++ icu/source/common/ucnv2022.c	2009-06-11 12:24:48.000000000 +0100
@@ -472,8 +472,7 @@
             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
                 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
             }
-            myConverterData->myConverterArray[JISX201]      = ucnv_loadSharedData("JISX0201", NULL, errorCode);
-            myConverterData->myConverterArray[JISX208]      = ucnv_loadSharedData("jisx-208", NULL, errorCode);
+            myConverterData->myConverterArray[JISX208]      = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
             if(jpCharsetMasks[version]&CSM(JISX212)) {
                 myConverterData->myConverterArray[JISX212]  = ucnv_loadSharedData("jisx-212", NULL, errorCode);
             }
@@ -1040,14 +1039,6 @@
                 length=3;
             }
         }
-        /*
-         * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
-         * Pass in parameter for type of output bytes, for validation and shifting:
-         * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
-         *   (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
-         * - A1-FE: Subtract 80 after range check.
-         * - SJIS: Shift DBCS result to 21-7E x 21-7E.
-         */
         /* is this code point assigned, or do we use fallbacks? */
         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
             /* assigned */
@@ -1105,6 +1096,23 @@
     }
 }
 
+/*
+ * Check that the result is a 2-byte value with each byte in the range A1..FE
+ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
+ * to move it to the ISO 2022 range 21..7E.
+ * Return 0 if out of range.
+ */
+static U_INLINE uint32_t
+_2022FromGR94DBCS(uint32_t value) {
+    if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
+        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
+    ) {
+        return value - 0x8080;  /* shift down to 21..7e byte range */
+    } else {
+        return 0;  /* not valid for ISO 2022 */
+    }
+}
+
 #ifdef U_ENABLE_GENERIC_ISO_2022
 
 /**********************************************************************************
@@ -1233,7 +1241,7 @@
     }
     else{
         cnv->toUBytes[0] =(char) sourceChar;
-        cnv->toULength = 2;
+        cnv->toULength = 1;
     }
 
     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
@@ -1344,6 +1352,181 @@
 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
 */
 
+/* Map 00..7F to Unicode according to JIS X 0201. */
+static U_INLINE uint32_t
+jisx201ToU(uint32_t value) {
+    if(value < 0x5c) {
+        return value;
+    } else if(value == 0x5c) {
+        return 0xa5;
+    } else if(value == 0x7e) {
+        return 0x203e;
+    } else /* value <= 0x7f */ {
+        return value;
+    }
+}
+
+/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
+static U_INLINE uint32_t
+jisx201FromU(uint32_t value) {
+    if(value<=0x7f) {
+        if(value!=0x5c && value!=0x7e) {
+            return value;
+        }
+    } else if(value==0xa5) {
+        return 0x5c;
+    } else if(value==0x203e) {
+        return 0x7e;
+    }
+    return 0xfffe;
+}
+
+/*
+ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
+ * to JIS X 0208, and convert it to a pair of 21..7E bytes.
+ * Return 0 if the byte pair is out of range.
+ */
+static U_INLINE uint32_t
+_2022FromSJIS(uint32_t value) {
+    uint8_t trail;
+
+    if(value > 0xEFFC) {
+        return 0;  /* beyond JIS X 0208 */
+    }
+
+    trail = (uint8_t)value;
+
+    value &= 0xff00;  /* lead byte */
+    if(value <= 0x9f00) {
+        value -= 0x7000;
+    } else /* 0xe000 <= value <= 0xef00 */ {
+        value -= 0xb000;
+    }
+    value <<= 1;
+
+    if(trail <= 0x9e) {
+        value -= 0x100;
+        if(trail <= 0x7e) {
+            value |= trail - 0x1f;
+        } else {
+            value |= trail - 0x20;
+        }
+    } else /* trail <= 0xfc */ {
+        value |= trail - 0x7e;
+    }
+    return value;
+}
+
+/*
+ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
+ * If either byte is outside 21..7E make sure that the result is not valid
+ * for Shift-JIS so that the converter catches it.
+ * Some invalid byte values already turn into equally invalid Shift-JIS
+ * byte values and need not be tested explicitly.
+ */
+static U_INLINE void
+_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
+    if(c1&1) {
+        ++c1;
+        if(c2 <= 0x5f) {
+            c2 += 0x1f;
+        } else if(c2 <= 0x7e) {
+            c2 += 0x20;
+        } else {
+            c2 = 0;  /* invalid */
+        }
+    } else {
+        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
+            c2 += 0x7e;
+        } else {
+            c2 = 0;  /* invalid */
+        }
+    }
+    c1 >>= 1;
+    if(c1 <= 0x2f) {
+        c1 += 0x70;
+    } else if(c1 <= 0x3f) {
+        c1 += 0xb0;
+    } else {
+        c1 = 0;  /* invalid */
+    }
+    bytes[0] = (char)c1;
+    bytes[1] = (char)c2;
+}
+
+/*
+ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
+ * Katakana.
+ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
+ * because Shift-JIS roundtrips half-width Katakana to single bytes.
+ * These were the only fallbacks in ICU's jisx-208.ucm file.
+ */
+static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
+    0x2123,  /* U+FF61 */
+    0x2156,
+    0x2157,
+    0x2122,
+    0x2126,
+    0x2572,
+    0x2521,
+    0x2523,
+    0x2525,
+    0x2527,
+    0x2529,
+    0x2563,
+    0x2565,
+    0x2567,
+    0x2543,
+    0x213C,  /* U+FF70 */
+    0x2522,
+    0x2524,
+    0x2526,
+    0x2528,
+    0x252A,
+    0x252B,
+    0x252D,
+    0x252F,
+    0x2531,
+    0x2533,
+    0x2535,
+    0x2537,
+    0x2539,
+    0x253B,
+    0x253D,
+    0x253F,  /* U+FF80 */
+    0x2541,
+    0x2544,
+    0x2546,
+    0x2548,
+    0x254A,
+    0x254B,
+    0x254C,
+    0x254D,
+    0x254E,
+    0x254F,
+    0x2552,
+    0x2555,
+    0x2558,
+    0x255B,
+    0x255E,
+    0x255F,  /* U+FF90 */
+    0x2560,
+    0x2561,
+    0x2562,
+    0x2564,
+    0x2566,
+    0x2568,
+    0x2569,
+    0x256A,
+    0x256B,
+    0x256C,
+    0x256D,
+    0x256F,
+    0x2573,
+    0x212B,
+    0x212C   /* U+FF9F */
+};
+
 static void
 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
     UConverter *cnv = args->converter;
@@ -1499,7 +1682,7 @@
                     }
                     break;
                 case HWKANA_7BIT:
-                    if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
+                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                         if(converterData->version==3) {
                             /* JIS7: use G1 (SO) */
                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
@@ -1526,13 +1709,34 @@
                     break;
                 case JISX201:
                     /* G0 SBCS */
-                    len2 = MBCS_SINGLE_FROM_UCHAR32(
+                    value = jisx201FromU(sourceChar);
+                    if(value <= 0x7f) {
+                        targetValue = value;
+                        len = 1;
+                        cs = cs0;
+                        g = 0;
+                        useFallback = FALSE;
+                    }
+                    break;
+                case JISX208:
+                    /* G0 DBCS from Shift-JIS table */
+                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                 converterData->myConverterArray[cs0],
                                 sourceChar, &value,
-                                useFallback);
-                    if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
-                        targetValue = value;
-                        len = len2;
+                                useFallback, MBCS_OUTPUT_2);
+                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
+                        value = _2022FromSJIS(value);
+                        if(value != 0) {
+                            targetValue = value;
+                            len = len2;
+                            cs = cs0;
+                            g = 0;
+                            useFallback = FALSE;
+                        }
+                    } else if(len == 0 && useFallback &&
+                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
+                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
+                        len = -2;
                         cs = cs0;
                         g = 0;
                         useFallback = FALSE;
@@ -1564,17 +1768,10 @@
                              * Check for valid bytes for the encoding scheme.
                              * This is necessary because the sub-converter (windows-949)
                              * has a broader encoding scheme than is valid for 2022.
-                             *
-                             * Check that the result is a 2-byte value with each byte in the range A1..FE
-                             * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
-                             * to move it to the ISO 2022 range 21..7E.
                              */
-                            if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
-                                (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
-                            ) {
-                                value -= 0x8080;  /* shift down to 21..7e byte range */
-                            } else {
-                                break;  /* not valid for ISO 2022 */
+                            value = _2022FromGR94DBCS(value);
+                            if(value == 0) {
+                                break;
                             }
                         }
                         targetValue = value;
@@ -1750,7 +1947,7 @@
 static void
 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                                UErrorCode* err){
-    char tempBuf[3];
+    char tempBuf[2];
     const char *mySource = (char *) args->source;
     UChar *myTarget = args->target;
     const char *mySourceLimit = args->sourceLimit;
@@ -1868,10 +2065,7 @@
                     break;
                 case JISX201:
                     if(mySourceChar <= 0x7f) {
-                        targetUniChar =
-                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
-                                myData->myConverterArray[cs],
-                                mySourceChar);
+                        targetUniChar = jisx201ToU(mySourceChar);
                     }
                     break;
                 case HWKANA_7BIT:
@@ -1885,8 +2079,13 @@
                     if(mySource < mySourceLimit) {
                         char trailByte;
 getTrailByte:
-                        tempBuf[0] = (char) (mySourceChar);
-                        tempBuf[1] = trailByte = *mySource++;
+                        trailByte = *mySource++;
+                        if(cs == JISX208) {
+                            _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
+                        } else {
+                            tempBuf[0] = (char)mySourceChar;
+                            tempBuf[1] = trailByte;
+                        }
                         mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                     } else {
@@ -3190,6 +3389,9 @@
     /* open a set and initialize it with code points that are algorithmically round-tripped */
     switch(cnvData->locale[0]){
     case 'j':
+        /* include JIS X 0201 which is hardcoded */
+        sa->add(sa->set, 0xa5);
+        sa->add(sa->set, 0x203e);
         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
             /* include Latin-1 for some variants of JP */
             sa->addRange(sa->set, 0, 0xff);
@@ -3198,6 +3400,11 @@
             sa->addRange(sa->set, 0, 0x7f);
         }
         if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
+            /*
+             * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
+             * we need to include half-width Katakana for all JP variants because
+             * JIS X 0208 has hardcoded fallbacks for them.
+             */
             /* include half-width Katakana for JP */
             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
         }
@@ -3217,15 +3424,7 @@
         break;
     }
 
-    /*
-     * Version-specific for CN:
-     * CN version 0 does not map CNS planes 3..7 although
-     * they are all available in the CNS conversion table;
-     * CN version 1 does map them all.
-     * The two versions create different Unicode sets.
-     */
-    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
-        if(cnvData->myConverterArray[i]!=NULL) {
+#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
                 cnvData->version==0 && i==CNS_11643
             ) {
@@ -3235,9 +3434,33 @@
                         sa, UCNV_ROUNDTRIP_SET,
                         0, 0x81, 0x82,
                         pErrorCode);
+            }
+#endif
+
+    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
+        UConverterSetFilter filter;
+        if(cnvData->myConverterArray[i]!=NULL) {
+            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
+                cnvData->version==0 && i==CNS_11643
+            ) {
+                /*
+                 * Version-specific for CN:
+                 * CN version 0 does not map CNS planes 3..7 although
+                 * they are all available in the CNS conversion table;
+                 * CN version 1 (-EXT) does map them all.
+                 * The two versions create different Unicode sets.
+                 */
+                filter=UCNV_SET_FILTER_2022_CN;
+            } else if(cnvData->locale[0]=='j' && i==JISX208) {
+                /*
+                 * Only add code points that map to Shift-JIS codes
+                 * corresponding to JIS X 0208.
+                 */
+                filter=UCNV_SET_FILTER_SJIS;
             } else {
-                ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
+                filter=UCNV_SET_FILTER_NONE;
             }
+            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
         }
     }
 
diff -ru icu.orig/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.orig/source/common/ucnvmbcs.c	2009-06-11 12:21:48.000000000 +0100
+++ icu/source/common/ucnvmbcs.c	2009-06-11 12:22:56.000000000 +0100
@@ -362,6 +362,8 @@
 
 /* Miscellaneous ------------------------------------------------------------ */
 
+#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
+
 /* similar to ucnv_MBCSGetNextUChar() but recursive */
 static void
 _getUnicodeSetForBytes(const UConverterSharedData *sharedData,
@@ -454,11 +456,14 @@
         pErrorCode);
 }
 
+#endif
+
 U_CFUNC void
-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
-                             const USetAdder *sa,
-                             UConverterUnicodeSet which,
-                             UErrorCode *pErrorCode) {
+ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+                                         const USetAdder *sa,
+                                         UConverterUnicodeSet which,
+                                         UConverterSetFilter filter,
+                                         UErrorCode *pErrorCode) {
     const UConverterMBCSTable *mbcsTable;
     const uint16_t *table;
 
@@ -512,50 +517,26 @@
                 c+=1024; /* empty stage 2 block */
             }
         }
-    } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) {
-        /* ignore single-byte results */
+    } else {
         const uint32_t *stage2;
-        const uint16_t *stage3, *results;
+        const uint8_t *stage3, *bytes;
+        uint32_t st3Multiplier;
+        uint32_t value;
 
-        results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
-
-        for(st1=0; st1<maxStage1; ++st1) {
-            st2=table[st1];
-            if(st2>(maxStage1>>1)) {
-                stage2=(const uint32_t *)table+st2;
-                for(st2=0; st2<64; ++st2) {
-                    if((st3=stage2[st2])!=0) {
-                        /* read the stage 3 block */
-                        stage3=results+16*(uint32_t)(uint16_t)st3;
-
-                        /* get the roundtrip flags for the stage 3 block */
-                        st3>>=16;
+        bytes=mbcsTable->fromUnicodeBytes;
 
-                        /*
-                         * Add code points for which the roundtrip flag is set.
-                         * Once we get a set for fallback mappings, we have to check
-                         * non-roundtrip stage 3 results for whether they are 0.
-                         * See ucnv_MBCSFromUnicodeWithOffsets() for details.
-                         *
-                         * Ignore single-byte results (<0x100).
-                         */
-                        do {
-                            if((st3&1)!=0 && *stage3>=0x100) {
-                                sa->add(sa->set, c);
-                            }
-                            st3>>=1;
-                            ++stage3;
-                        } while((++c&0xf)!=0);
-                    } else {
-                        c+=16; /* empty stage 3 block */
-                    }
-                }
-            } else {
-                c+=1024; /* empty stage 2 block */
-            }
+        switch(mbcsTable->outputType) {
+        case MBCS_OUTPUT_3:
+        case MBCS_OUTPUT_4_EUC:
+            st3Multiplier=3;
+            break;
+        case MBCS_OUTPUT_4:
+            st3Multiplier=4;
+            break;
+        default:
+            st3Multiplier=2;
+            break;
         }
-    } else {
-        const uint32_t *stage2;
 
         for(st1=0; st1<maxStage1; ++st1) {
             st2=table[st1];
@@ -563,6 +544,9 @@
                 stage2=(const uint32_t *)table+st2;
                 for(st2=0; st2<64; ++st2) {
                     if((st3=stage2[st2])!=0) {
+                        /* read the stage 3 block */
+                        stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
+
                         /* get the roundtrip flags for the stage 3 block */
                         st3>>=16;
 
@@ -572,12 +556,49 @@
                          * non-roundtrip stage 3 results for whether they are 0.
                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
                          */
-                        do {
-                            if(st3&1) {
-                                sa->add(sa->set, c);
-                            }
-                            st3>>=1;
-                        } while((++c&0xf)!=0);
+                        switch(filter) {
+                        case UCNV_SET_FILTER_NONE:
+                            do {
+                                if(st3&1) {
+                                    sa->add(sa->set, c);
+                                }
+                                st3>>=1;
+                            } while((++c&0xf)!=0);
+                            break;
+                        case UCNV_SET_FILTER_DBCS_ONLY:
+                             /* Ignore single-byte results (<0x100). */
+                            do {
+                                if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
+                                    sa->add(sa->set, c);
+                                }
+                                st3>>=1;
+                                stage3+=2;  /* +=st3Multiplier */
+                            } while((++c&0xf)!=0);
+                            break;
+                        case UCNV_SET_FILTER_2022_CN:
+                             /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
+                            do {
+                                if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
+                                    sa->add(sa->set, c);
+                                }
+                                st3>>=1;
+                                stage3+=3;  /* +=st3Multiplier */
+                            } while((++c&0xf)!=0);
+                            break;
+                        case UCNV_SET_FILTER_SJIS:
+                             /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
+                            do {
+                                if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
+                                    sa->add(sa->set, c);
+                                }
+                                st3>>=1;
+                                stage3+=2;  /* +=st3Multiplier */
+                            } while((++c&0xf)!=0);
+                            break;
+                        default:
+                            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
+                            return;
+                        }
                     } else {
                         c+=16; /* empty stage 3 block */
                     }
@@ -591,6 +612,19 @@
     ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
 }
 
+U_CFUNC void
+ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+                                 const USetAdder *sa,
+                                 UConverterUnicodeSet which,
+                                 UErrorCode *pErrorCode) {
+    ucnv_MBCSGetFilteredUnicodeSetForUnicode(
+        sharedData, sa, which,
+        sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
+            UCNV_SET_FILTER_DBCS_ONLY :
+            UCNV_SET_FILTER_NONE,
+        pErrorCode);
+}
+
 static void
 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
                    const USetAdder *sa,
diff -ru icu.orig/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
--- icu.orig/source/common/ucnvmbcs.h	2009-06-11 12:21:48.000000000 +0100
+++ icu/source/common/ucnvmbcs.h	2009-06-11 12:22:56.000000000 +0100
@@ -456,6 +456,7 @@
 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode);
 
+#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
 /*
  * Internal function returning a UnicodeSet for toUnicode() conversion.
  * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
@@ -470,6 +471,7 @@
                            UConverterUnicodeSet which,
                            uint8_t state, int32_t lowByte, int32_t highByte,
                            UErrorCode *pErrorCode);
+#endif
 
 /*
  * Internal function returning a UnicodeSet for toUnicode() conversion.
@@ -481,9 +483,30 @@
  */
 U_CFUNC void
 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
-                             const USetAdder *sa,
-                             UConverterUnicodeSet which,
-                             UErrorCode *pErrorCode);
+                                 const USetAdder *sa,
+                                 UConverterUnicodeSet which,
+                                 UErrorCode *pErrorCode);
+
+typedef enum UConverterSetFilter {
+    UCNV_SET_FILTER_NONE,
+    UCNV_SET_FILTER_DBCS_ONLY,
+    UCNV_SET_FILTER_2022_CN,
+    UCNV_SET_FILTER_SJIS,
+    UCNV_SET_FILTER_COUNT
+} UConverterSetFilter;
+
+/*
+ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but
+ * the set can be filtered by encoding scheme.
+ * Used by stateful converters which share regular conversion tables
+ * but only use a subset of their mappings.
+ */
+U_CFUNC void
+ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+                                         const USetAdder *sa,
+                                         UConverterUnicodeSet which,
+                                         UConverterSetFilter filter,
+                                         UErrorCode *pErrorCode);
 
 #endif
 
diff -ru icu.orig/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c
--- icu.orig/source/test/cintltst/nucnvtst.c	2009-06-11 12:21:47.000000000 +0100
+++ icu/source/test/cintltst/nucnvtst.c	2009-06-11 12:23:19.000000000 +0100
@@ -3202,7 +3202,7 @@
         0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A,
         0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
         0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
-        0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+        0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
         0x201D, 0x3014, 0x000D, 0x000A,
         0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
         0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
@@ -3730,7 +3730,7 @@
         0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A,
         0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A,
         0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A,
-        0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
+        0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A,
         0x201D, 0x000D, 0x000A,
         0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A,
         0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A,
diff -ru icu.orig/source/test/cintltst/udatatst.c icu/source/test/cintltst/udatatst.c
--- icu.orig/source/test/cintltst/udatatst.c	2009-06-11 12:21:47.000000000 +0100
+++ icu/source/test/cintltst/udatatst.c	2009-06-11 12:23:19.000000000 +0100
@@ -1281,7 +1281,7 @@
      * MBCS conversion table file without extension,
      * to test swapping and preflighting of UTF-8-friendly mbcsIndex[].
      */
-    {"jisx-208",                 "cnv", ucnv_swap},
+    {"jisx-212",                 "cnv", ucnv_swap},
 #endif
 
 #if !UCONFIG_NO_CONVERSION
diff -ru icu.orig/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.orig/source/test/testdata/conversion.txt	2009-06-11 12:21:48.000000000 +0100
+++ icu/source/test/testdata/conversion.txt	2009-06-11 12:22:56.000000000 +0100
@@ -48,6 +48,15 @@
     toUnicode {
       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
       Cases {
+        // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+        // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+        {
+          "ISO-2022-JP",
+          :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
+          "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
+          :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
+          :int{1}, :int{1}, "", "?", :bin{""}
+        }
         // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
         {
           "ISO-8859-3",
@@ -495,6 +504,15 @@
     fromUnicode {
       Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
       Cases {
+        // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+        // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+        {
+          "ISO-2022-JP",
+          "\u203e\xa5\u4e00\ufa10\u6f3e\u0391",
+          :bin{       1b284a7e5c1b2442306c222e5f2126211b2842 },
+          :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 },
+          :int{1}, :int{0}, "", "?=\u3013", ""  // U+3013 Geta Mark converts to 222e
+        }
         // Verify that mappings that would result in byte values outside 20..7F (for SBCS)
         // or 21..7E (for DBCS) are not used.
         // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46):
@@ -1293,13 +1311,13 @@
         // versions of ISO-2022-JP
         {
           "ISO-2022-JP",
-          "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
-          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
+          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
+          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
           :int{0}
         }   
         {
           "ISO-2022-JP-2",
-          "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
+          "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
           "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
           :int{0}
         }

icu.icu6001.backport.patch:

--- NEW FILE icu.icu6001.backport.patch ---
diff -ru icu.icu5797/source/common/ucnv2022.c icu/source/common/ucnv2022.c
--- icu.icu5797/source/common/ucnv2022.c	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv2022.c	2009-06-11 13:21:14.000000000 +0100
@@ -3399,11 +3399,19 @@
             /* include ASCII for JP */
             sa->addRange(sa->set, 0, 0x7f);
         }
-        if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
+        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
             /*
-             * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
-             * we need to include half-width Katakana for all JP variants because
-             * JIS X 0208 has hardcoded fallbacks for them.
+             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
+             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
+             * use half-width Katakana.
+             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
+             * half-width Katakana via the ESC ( I sequence.
+             * However, we only emit (fromUnicode) half-width Katakana according to the
+             * definition of each variant.
+             *
+             * When including fallbacks,
+             * we need to include half-width Katakana Unicode code points for all JP variants because
+             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
              */
             /* include half-width Katakana for JP */
             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
@@ -3457,6 +3465,12 @@
                  * corresponding to JIS X 0208.
                  */
                 filter=UCNV_SET_FILTER_SJIS;
+            } else if(i==KSC5601) {
+                /*
+                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
+                 * are broader than GR94.
+                 */
+                filter=UCNV_SET_FILTER_GR94DBCS;
             } else {
                 filter=UCNV_SET_FILTER_NONE;
             }
@@ -3472,6 +3486,9 @@
     sa->remove(sa->set, 0x0e);
     sa->remove(sa->set, 0x0f);
     sa->remove(sa->set, 0x1b);
+
+    /* ISO 2022 converters do not convert C1 controls either */
+    sa->removeRange(sa->set, 0x80, 0x9f);
 }
 
 static const UConverterImpl _ISO2022Impl={
diff -ru icu.icu5797/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
--- icu.icu5797/source/common/ucnv_ext.c	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_ext.c	2009-06-11 13:21:14.000000000 +0100
@@ -946,7 +946,7 @@
 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
                             const int32_t *cx,
                             const USetAdder *sa,
-                            UConverterUnicodeSet which,
+                            UBool useFallback,
                             int32_t minLength,
                             UChar32 c,
                             UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
@@ -966,7 +966,7 @@
     value=*fromUSectionValues++;
 
     if( value!=0 &&
-        UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
+        (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
         UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
     ) {
         if(c>=0) {
@@ -987,12 +987,14 @@
             /* no mapping, do nothing */
         } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
             ucnv_extGetUnicodeSetString(
-                sharedData, cx, sa, which, minLength,
+                sharedData, cx, sa, useFallback, minLength,
                 U_SENTINEL, s, length+1,
                 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
                 pErrorCode);
-        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
-                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+        } else if((useFallback ?
+                      (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
+                      ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
                   UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
         ) {
             sa->addString(sa->set, s, length+1);
@@ -1004,6 +1006,7 @@
 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
                       const USetAdder *sa,
                       UConverterUnicodeSet which,
+                      UConverterSetFilter filter,
                       UErrorCode *pErrorCode) {
     const int32_t *cx;
     const uint16_t *stage12, *stage3, *ps2, *ps3;
@@ -1011,6 +1014,7 @@
 
     uint32_t value;
     int32_t st1, stage1Length, st2, st3, minLength;
+    UBool useFallback;
 
     UChar s[UCNV_EXT_MAX_UCHARS];
     UChar32 c;
@@ -1027,12 +1031,20 @@
 
     stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
 
+    useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
+
     /* enumerate the from-Unicode trie table */
     c=0; /* keep track of the current code point while enumerating */
 
-    if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
+    if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
+        filter==UCNV_SET_FILTER_DBCS_ONLY ||
+        filter==UCNV_SET_FILTER_SJIS ||
+        filter==UCNV_SET_FILTER_GR94DBCS
+    ) {
         /* DBCS-only, ignore single-byte results */
         minLength=2;
+    } else if(filter==UCNV_SET_FILTER_2022_CN) {
+        minLength=3;
     } else {
         minLength=1;
     }
@@ -1064,14 +1076,41 @@
                             length=0;
                             U16_APPEND_UNSAFE(s, length, c);
                             ucnv_extGetUnicodeSetString(
-                                sharedData, cx, sa, which, minLength,
+                                sharedData, cx, sa, useFallback, minLength,
                                 c, s, length,
                                 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
                                 pErrorCode);
-                        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
-                                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+                        } else if((useFallback ?
+                                      (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
+                                      ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+                                          UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
                                   UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
                         ) {
+                            switch(filter) {
+                            case UCNV_SET_FILTER_2022_CN:
+                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
+                                    continue;
+                                }
+                                break;
+                            case UCNV_SET_FILTER_SJIS:
+                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
+                                    continue;
+                                }
+                                break;
+                            case UCNV_SET_FILTER_GR94DBCS:
+                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
+                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) &&
+                                     (uint8_t)(value-0xa1)<=(0xfe-0xa1))) {
+                                    continue;
+                                }
+                                break;
+                            default:
+                                /*
+                                 * UCNV_SET_FILTER_NONE,
+                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
+                                 */
+                                break;
+                            }
                             sa->add(sa->set, c);
                         }
                     } while((++c&0xf)!=0);
diff -ru icu.icu5797/source/common/ucnv_ext.h icu/source/common/ucnv_ext.h
--- icu.icu5797/source/common/ucnv_ext.h	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_ext.h	2009-06-11 13:21:14.000000000 +0100
@@ -382,10 +382,20 @@
                            UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
                            UErrorCode *pErrorCode);
 
+/*
+ * Add code points and strings to the set according to the extension mappings.
+ * Limitation on the UConverterSetFilter:
+ * The filters currently assume that they are used with 1:1 mappings.
+ * They only apply to single input code points, and then they pass through
+ * only mappings with single-charset-code results.
+ * For example, the Shift-JIS filter only works for 2-byte results and tests
+ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
+ */
 U_CFUNC void
 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
                       const USetAdder *sa,
                       UConverterUnicodeSet which,
+                      UConverterSetFilter filter,
                       UErrorCode *pErrorCode);
 
 /* toUnicode helpers -------------------------------------------------------- */
diff -ru icu.icu5797/source/common/ucnvhz.c icu/source/common/ucnvhz.c
--- icu.icu5797/source/common/ucnvhz.c	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnvhz.c	2009-06-11 13:21:14.000000000 +0100
@@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 2000-2006, International Business Machines
+*   Copyright (C) 2000-2007, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnvhz.c
@@ -528,6 +528,7 @@
     sa->add(sa->set, 0x7e);
 
     /* add all of the code points that the sub-converter handles */
+    /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
     ((UConverterDataHZ*)cnv->extraInfo)->
         gbConverter->sharedData->impl->
             getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
diff -ru icu.icu5797/source/common/ucnv_lmb.c icu/source/common/ucnv_lmb.c
--- icu.icu5797/source/common/ucnv_lmb.c	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_lmb.c	2009-06-11 13:21:33.000000000 +0100
@@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 2000-2006, International Business Machines
+*   Copyright (C) 2000-2007, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnv_lmb.cpp
@@ -536,7 +536,7 @@
     NULL,\
     NULL,\
     _LMBCSSafeClone,\
-    _LMBCSGetUnicodeSet\
+    ucnv_getCompleteUnicodeSet\
 };\
 static const UConverterStaticData _LMBCSStaticData##n={\
   sizeof(UConverterStaticData),\
@@ -662,15 +662,14 @@
     return &newLMBCS->cnv;
 }
 
-static void
-_LMBCSGetUnicodeSet(const UConverter *cnv,
-                   const USetAdder *sa,
-                   UConverterUnicodeSet which,
-                   UErrorCode *pErrorCode) {
-    /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
-    sa->addRange(sa->set, 0, 0xf5ff);
-    sa->addRange(sa->set, 0xf700, 0x10ffff);
-}
+/*
+ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
+ * which added all code points except for U+F6xx
+ * because those cannot be represented in the Unicode group.
+ * However, it turns out that windows-950 has roundtrips for all of U+F6xx
+ * which means that LMBCS can convert all Unicode code points after all.
+ * We now simply use ucnv_getCompleteUnicodeSet().
+ */
 
 /* 
    Here's the basic helper function that we use when converting from
diff -ru icu.icu5797/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.icu5797/source/common/ucnvmbcs.c	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnvmbcs.c	2009-06-11 13:21:14.000000000 +0100
@@ -485,9 +485,23 @@
 
     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
         const uint16_t *stage2, *stage3, *results;
+        uint16_t minValue;
 
         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
 
+        /*
+         * Set a threshold variable for selecting which mappings to use.
+         * See ucnv_MBCSSingleFromBMPWithOffsets() and
+         * MBCS_SINGLE_RESULT_FROM_U() for details.
+         */
+        if(which==UCNV_ROUNDTRIP_SET) {
+            /* use only roundtrips */
+            minValue=0xf00;
+        } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
+            /* use all roundtrip and fallback results */
+            minValue=0x800;
+        }
+
         for(st1=0; st1<maxStage1; ++st1) {
             st2=table[st1];
             if(st2>maxStage1) {
@@ -497,15 +511,8 @@
                         /* read the stage 3 block */
                         stage3=results+st3;
 
-                        /*
-                         * Add code points for which the roundtrip flag is set.
-                         * Once we get a set for fallback mappings, we have to use
-                         * a threshold variable with a value of 0x800.
-                         * See ucnv_MBCSSingleFromBMPWithOffsets() and
-                         * MBCS_SINGLE_RESULT_FROM_U() for details.
-                         */
                         do {
-                            if(*stage3++>=0xf00) {
+                            if(*stage3++>=minValue) {
                                 sa->add(sa->set, c);
                             }
                         } while((++c&0xf)!=0);
@@ -522,9 +529,12 @@
         const uint8_t *stage3, *bytes;
         uint32_t st3Multiplier;
         uint32_t value;
+        UBool useFallback;
 
         bytes=mbcsTable->fromUnicodeBytes;
 
+        useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
+
         switch(mbcsTable->outputType) {
         case MBCS_OUTPUT_3:
         case MBCS_OUTPUT_4_EUC:
@@ -551,9 +561,8 @@
                         st3>>=16;
 
                         /*
-                         * Add code points for which the roundtrip flag is set.
-                         * Once we get a set for fallback mappings, we have to check
-                         * non-roundtrip stage 3 results for whether they are 0.
+                         * Add code points for which the roundtrip flag is set,
+                         * or which map to non-zero bytes if we use fallbacks.
                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
                          */
                         switch(filter) {
@@ -561,6 +570,23 @@
                             do {
                                 if(st3&1) {
                                     sa->add(sa->set, c);
+                                    stage3+=st3Multiplier;
+                                } else if(useFallback) {
+                                    uint8_t b=0;
+                                    switch(st3Multiplier) {
+                                    case 4:
+                                        b|=*stage3++;
+                                    case 3:
+                                        b|=*stage3++;
+                                    case 2:
+                                        b|=stage3[0]|stage3[1];
+                                        stage3+=2;
+                                    default:
+                                        break;
+                                    }
+                                    if(b!=0) {
+                                        sa->add(sa->set, c);
+                                    }
                                 }
                                 st3>>=1;
                             } while((++c&0xf)!=0);
@@ -568,7 +594,7 @@
                         case UCNV_SET_FILTER_DBCS_ONLY:
                              /* Ignore single-byte results (<0x100). */
                             do {
-                                if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
+                                if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
                                     sa->add(sa->set, c);
                                 }
                                 st3>>=1;
@@ -578,7 +604,7 @@
                         case UCNV_SET_FILTER_2022_CN:
                              /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
                             do {
-                                if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
+                                if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
                                     sa->add(sa->set, c);
                                 }
                                 st3>>=1;
@@ -588,7 +614,20 @@
                         case UCNV_SET_FILTER_SJIS:
                              /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
                             do {
-                                if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
+                                if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
+                                    sa->add(sa->set, c);
+                                }
+                                st3>>=1;
+                                stage3+=2;  /* +=st3Multiplier */
+                            } while((++c&0xf)!=0);
+                            break;
+                        case UCNV_SET_FILTER_GR94DBCS:
+                            /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
+                            do {
+                                if( ((st3&1)!=0 || useFallback) &&
+                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) &&
+                                    (uint8_t)(value-0xa1)<=(0xfe-0xa1)
+                                ) {
                                     sa->add(sa->set, c);
                                 }
                                 st3>>=1;
@@ -609,7 +648,7 @@
         }
     }
 
-    ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
+    ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
 }
 
 U_CFUNC void
diff -ru icu.icu5797/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
--- icu.icu5797/source/common/ucnvmbcs.h	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnvmbcs.h	2009-06-11 13:21:14.000000000 +0100
@@ -492,6 +492,7 @@
     UCNV_SET_FILTER_DBCS_ONLY,
     UCNV_SET_FILTER_2022_CN,
     UCNV_SET_FILTER_SJIS,
+    UCNV_SET_FILTER_GR94DBCS,
     UCNV_SET_FILTER_COUNT
 } UConverterSetFilter;
 
diff -ru icu.icu5797/source/common/ucnv_set.c icu/source/common/ucnv_set.c
--- icu.icu5797/source/common/ucnv_set.c	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/ucnv_set.c	2009-06-11 13:21:14.000000000 +0100
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2003-2005, International Business Machines
+*   Copyright (C) 2003-2007, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -52,7 +52,8 @@
             uset_add,
             uset_addRange,
             uset_addString,
-            uset_remove
+            uset_remove,
+            uset_removeRange
         };
         sa.set=setFillIn;
 
diff -ru icu.icu5797/source/common/unicode/ucnv.h icu/source/common/unicode/ucnv.h
--- icu.icu5797/source/common/unicode/ucnv.h	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/unicode/ucnv.h	2009-06-11 13:21:14.000000000 +0100
@@ -870,6 +870,8 @@
 typedef enum UConverterUnicodeSet {
     /** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
     UCNV_ROUNDTRIP_SET,
+    /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */
+    UCNV_ROUNDTRIP_AND_FALLBACK_SET,
     /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */
     UCNV_SET_COUNT
 } UConverterUnicodeSet;
@@ -878,11 +880,16 @@
 /**
  * Returns the set of Unicode code points that can be converted by an ICU converter.
  *
- * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET):
+ * Returns one of several kinds of set:
+ *
+ * 1. UCNV_ROUNDTRIP_SET
+ *
  * The set of all Unicode code points that can be roundtrip-converted
- * (converted without any data loss) with the converter.
+ * (converted without any data loss) with the converter (ucnv_fromUnicode()).
  * This set will not include code points that have fallback mappings
  * or are only the result of reverse fallback mappings.
+ * This set will also not include PUA code points with fallbacks, although
+ * ucnv_fromUnicode() will always uses those mappings despite ucnv_setFallback().
  * See UTR #22 "Character Mapping Markup Language"
  * at http://www.unicode.org/reports/tr22/
  *
@@ -893,6 +900,12 @@
  *   by comparing its roundtrip set with the set of ExemplarCharacters from
  *   ICU's locale data or other sources
  *
+ * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET
+ *
+ * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode())
+ * when fallbacks are turned on (see ucnv_setFallback()).
+ * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks).
+ *
  * In the future, there may be more UConverterUnicodeSet choices to select
  * sets with different properties.
  *
diff -ru icu.icu5797/source/common/uset_imp.h icu/source/common/uset_imp.h
--- icu.icu5797/source/common/uset_imp.h	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/common/uset_imp.h	2009-06-11 13:21:14.000000000 +0100
@@ -36,6 +36,9 @@
 typedef void U_CALLCONV
 USetRemove(USet *set, UChar32 c);
 
+typedef void U_CALLCONV
+USetRemoveRange(USet *set, UChar32 start, UChar32 end);
+
 /**
  * Interface for adding items to a USet, to keep low-level code from
  * statically depending on the USet implementation.
@@ -47,6 +50,7 @@
     USetAddRange *addRange;
     USetAddString *addString;
     USetRemove *remove;
+    USetRemoveRange *removeRange;
 };
 typedef struct USetAdder USetAdder;
 
diff -ru icu.icu5797/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
--- icu.icu5797/source/test/intltest/convtest.cpp	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/test/intltest/convtest.cpp	2009-06-11 13:21:51.000000000 +0100
@@ -70,6 +70,7 @@
         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
         case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
+        case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
         default: name=""; break; //needed to end loop
     }
 }
@@ -465,6 +466,183 @@
     }
 }
 
+U_CDECL_BEGIN
+static void U_CALLCONV
+getUnicodeSetCallback(const void *context,
+                      UConverterFromUnicodeArgs *fromUArgs,
+                      const UChar* codeUnits,
+                      int32_t length,
+                      UChar32 codePoint,
+                      UConverterCallbackReason reason,
+                      UErrorCode *pErrorCode) {
+    if(reason<=UCNV_IRREGULAR) {
+        ((UnicodeSet *)context)->remove(codePoint);  // the converter cannot convert this code point
+        *pErrorCode=U_ZERO_ERROR;                    // skip
+    }  // else ignore the reset, close and clone calls.
+}
+U_CDECL_END
+
+// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
+void
+ConversionTest::TestGetUnicodeSet2() {
+    // Build a string with all code points.
+    UChar32 cpLimit;
+    int32_t s0Length;
+    if(quick) {
+        cpLimit=s0Length=0x10000;  // BMP only
+    } else {
+        cpLimit=0x110000;
+        s0Length=0x10000+0x200000;  // BMP + surrogate pairs
+    }
+    UChar *s0=new UChar[s0Length];
+    if(s0==NULL) {
+        return;
+    }
+    UChar *s=s0;
+    UChar32 c;
+    UChar c2;
+    // low BMP
+    for(c=0; c<=0xd7ff; ++c) {
+        *s++=(UChar)c;
+    }
+    // trail surrogates
+    for(c=0xdc00; c<=0xdfff; ++c) {
+        *s++=(UChar)c;
+    }
+    // lead surrogates
+    // (after trails so that there is not even one surrogate pair in between)
+    for(c=0xd800; c<=0xdbff; ++c) {
+        *s++=(UChar)c;
+    }
+    // high BMP
+    for(c=0xe000; c<=0xffff; ++c) {
+        *s++=(UChar)c;
+    }
+    // supplementary code points = surrogate pairs
+    if(cpLimit==0x110000) {
+        for(c=0xd800; c<=0xdbff; ++c) {
+            for(c2=0xdc00; c2<=0xdfff; ++c2) {
+                *s++=(UChar)c;
+                *s++=c2;
+            }
+        }
+    }
+
+    static const char *const cnvNames[]={
+        "UTF-8",
+        "UTF-7",
+        "UTF-16",
+        "US-ASCII",
+        "ISO-8859-1",
+        "windows-1252",
+        "Shift-JIS",
+        "ibm-1390",  // EBCDIC_STATEFUL table
+        "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
+        // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
+        "ISO-2022-JP",
+        "JIS7",
+        "ISO-2022-CN",
+        "ISO-2022-CN-EXT",
+        "LMBCS"
+    };
+    char buffer[1024];
+    int32_t i;
+    for(i=0; i<LENGTHOF(cnvNames); ++i) {
+        UErrorCode errorCode=U_ZERO_ERROR;
+        UConverter *cnv=cnv_open(cnvNames[i], errorCode);
+        if(U_FAILURE(errorCode)) {
+            errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
+            continue;
+        }
+        UnicodeSet expected;
+        ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
+            ucnv_close(cnv);
+            continue;
+        }
+        UConverterUnicodeSet which;
+        for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
+            if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
+                ucnv_setFallback(cnv, TRUE);
+            }
+            expected.add(0, cpLimit-1);
+            s=s0;
+            UBool flush;
+            do {
+                char *t=buffer;
+                flush=(UBool)(s==s0+s0Length);
+                ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
+                if(U_FAILURE(errorCode)) {
+                    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+                        errorCode=U_ZERO_ERROR;
+                        continue;
+                    } else {
+                        break;  // unexpected error, should not occur
+                    }
+                }
+            } while(!flush);
+            UnicodeSet set;
+            ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
+            if(cpLimit<0x110000) {
+                set.remove(cpLimit, 0x10ffff);
+            }
+            if(which==UCNV_ROUNDTRIP_SET) {
+                // ignore PUA code points because they will be converted even if they
+                // are fallbacks and when other fallbacks are turned off,
+                // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
+                expected.remove(0xe000, 0xf8ff);
+                expected.remove(0xf0000, 0xffffd);
+                expected.remove(0x100000, 0x10fffd);
+                set.remove(0xe000, 0xf8ff);
+                set.remove(0xf0000, 0xffffd);
+                set.remove(0x100000, 0x10fffd);
+            }
+            if(set!=expected) {
+                // First try to see if we have different sets because ucnv_getUnicodeSet()
+                // added strings: The above conversion method does not tell us what strings might be convertible.
+                // Remove strings from the set and compare again.
+                // Unfortunately, there are no good, direct set methods for finding out whether there are strings
+                // in the set, nor for enumerating or removing just them.
+                // Intersect all code points with the set. The intersection will not contain strings.
+                UnicodeSet temp(0, 0x10ffff);
+                temp.retainAll(set);
+                set=temp;
+            }
+            if(set!=expected) {
+                UnicodeSet diffSet;
+                UnicodeString out;
+
+                // are there items that must be in the set but are not?
+                (diffSet=expected).removeAll(set);
+                if(!diffSet.isEmpty()) {
+                    diffSet.toPattern(out, TRUE);
+                    if(out.length()>100) {
+                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+                    }
+                    errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
+                            cnvNames[i], which);
+                    errln(out);
+                }
+
+                // are there items that must not be in the set but are?
+                (diffSet=set).removeAll(expected);
+                if(!diffSet.isEmpty()) {
+                    diffSet.toPattern(out, TRUE);
+                    if(out.length()>100) {
+                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+                    }
+                    errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
+                            cnvNames[i], which);
+                    errln(out);
+                }
+            }
+        }
+    }
+
+    delete [] s0;
+}
+
 // open testdata or ICU data converter ------------------------------------- ***
 
 UConverter *
diff -ru icu.icu5797/source/test/intltest/convtest.h icu/source/test/intltest/convtest.h
--- icu.icu5797/source/test/intltest/convtest.h	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/test/intltest/convtest.h	2009-06-11 13:21:13.000000000 +0100
@@ -72,6 +72,7 @@
     void TestToUnicode();
     void TestFromUnicode();
     void TestGetUnicodeSet();
+    void TestGetUnicodeSet2();
 
 private:
     UBool
diff -ru icu.icu5797/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.icu5797/source/test/testdata/conversion.txt	2009-06-11 12:26:42.000000000 +0100
+++ icu/source/test/testdata/conversion.txt	2009-06-11 13:21:33.000000000 +0100
@@ -1311,16 +1311,29 @@
         // versions of ISO-2022-JP
         {
           "ISO-2022-JP",
-          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
-          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
+          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
+          "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
           :int{0}
-        }   
+        }
         {
           "ISO-2022-JP-2",
-          "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
-          "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
+          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
+          "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
+          :int{0}
+        }
+        {
+          "JIS7",
+          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
+          "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
           :int{0}
         }
+        // with fallbacks
+        {
+          "ISO-2022-JP",
+          "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
+          "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
+          :int{1}
+        }
 
         // versions of ISO-2022-CN
         {
@@ -1352,6 +1365,14 @@
           :int{0}
         }
 
+        // LMBCS
+        {
+          "LMBCS",
+          "[\x00-\U0010ffff]",
+          "[]",
+          :int{0}
+        }
+
         // extensions
         {
           "ibm-1390",

icu.icu6002.backport.patch:

--- NEW FILE icu.icu6002.backport.patch ---
diff -ru icu.6001/source/common/ucnv_ext.c icu/source/common/ucnv_ext.c
--- icu.6001/source/common/ucnv_ext.c	2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnv_ext.c	2009-06-11 13:30:06.000000000 +0100
@@ -1036,15 +1036,13 @@
     /* enumerate the from-Unicode trie table */
     c=0; /* keep track of the current code point while enumerating */
 
-    if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
-        filter==UCNV_SET_FILTER_DBCS_ONLY ||
-        filter==UCNV_SET_FILTER_SJIS ||
-        filter==UCNV_SET_FILTER_GR94DBCS
+    if(filter==UCNV_SET_FILTER_2022_CN) {
+        minLength=3;
+    } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
+               filter!=UCNV_SET_FILTER_NONE
     ) {
         /* DBCS-only, ignore single-byte results */
         minLength=2;
-    } else if(filter==UCNV_SET_FILTER_2022_CN) {
-        minLength=3;
     } else {
         minLength=1;
     }
@@ -1099,8 +1097,15 @@
                                 break;
                             case UCNV_SET_FILTER_GR94DBCS:
                                 if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
-                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) &&
-                                     (uint8_t)(value-0xa1)<=(0xfe-0xa1))) {
+                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
+                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
+                                    continue;
+                                }
+                                break;
+                            case UCNV_SET_FILTER_HZ:
+                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
+                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
+                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
                                     continue;
                                 }
                                 break;
diff -ru icu.6001/source/common/ucnvhz.c icu/source/common/ucnvhz.c
--- icu.6001/source/common/ucnvhz.c	2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnvhz.c	2009-06-11 13:30:03.000000000 +0100
@@ -72,7 +72,7 @@
     cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
     if(cnv->extraInfo != NULL){
         uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
-        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
+        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);
     }
     else {
         *errorCode = U_MEMORY_ALLOCATION_ERROR;
@@ -141,7 +141,7 @@
     UChar *myTarget = args->target;
     const char *mySourceLimit = args->sourceLimit;
     UChar32 targetUniChar = 0x0000;
-    UChar mySourceChar = 0x0000;
+    int32_t mySourceChar = 0x0000;
     UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
     tempBuf[0]=0; 
     tempBuf[1]=0;
@@ -156,90 +156,71 @@
             
             mySourceChar= (unsigned char) *mySource++;
 
-            switch(mySourceChar){
+            if(args->converter->mode == UCNV_TILDE) {
+                /* second byte after ~ */
+                args->converter->mode=0;
+                switch(mySourceChar) {
                 case 0x0A:
-                    if(args->converter->mode ==UCNV_TILDE){
-                        args->converter->mode=0;
-                        
-                    }
-                    *(myTarget++)=(UChar)mySourceChar;
+                    /* no output for ~\n (line-continuation marker) */
                     continue;
-            
                 case UCNV_TILDE:
-                    if(args->converter->mode ==UCNV_TILDE){
-                        *(myTarget++)=(UChar)mySourceChar;
-                        args->converter->mode=0;
-                        continue;
-                        
+                    if(args->offsets) {
+                        args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
                     }
-                    else if(args->converter->toUnicodeStatus !=0){
-                        args->converter->mode=0;
-                        break;
-                    }
-                    else{
-                        args->converter->mode = UCNV_TILDE;
-                        continue;
-                    }
-                
-                
+                    *(myTarget++)=(UChar)mySourceChar;
+                    continue;
                 case UCNV_OPEN_BRACE:
-                    if(args->converter->mode == UCNV_TILDE){
-                        args->converter->mode=0;
-                        myData->isStateDBCS = TRUE;
-                        continue;
-                    }
-                    else{
-                        break;
-                    }
-               
-                
+                    myData->isStateDBCS = TRUE;
+                    continue;
                 case UCNV_CLOSE_BRACE:
-                    if(args->converter->mode == UCNV_TILDE){
-                        args->converter->mode=0;
-                         myData->isStateDBCS = FALSE;
-                        continue;
-                    }
-                    else{
-                        break;
-                    }
-                
+                    myData->isStateDBCS = FALSE;
+                    continue;
                 default:
                      /* if the first byte is equal to TILDE and the trail byte
                      * is not a valid byte then it is an error condition
                      */
-                    if(args->converter->mode == UCNV_TILDE){
-                        args->converter->mode=0;
-                        mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
-                        goto SAVE_STATE;
-                    }
-                    
+                    mySourceChar = 0x7e00 | mySourceChar;
+                    targetUniChar = 0xffff;
                     break;
-
-            }
-             
-            if(myData->isStateDBCS){
+                }
+            } else if(myData->isStateDBCS) {
                 if(args->converter->toUnicodeStatus == 0x00){
-                    args->converter->toUnicodeStatus = (UChar) mySourceChar;
+                    /* lead byte */
+                    if(mySourceChar == UCNV_TILDE) {
+                        args->converter->mode = UCNV_TILDE;
+                    } else {
+                        /* add another bit to distinguish a 0 byte from not having seen a lead byte */
+                        args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
+                    }
                     continue;
                 }
                 else{
-                    tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
-                    tempBuf[1] = (char) (mySourceChar+0x80);
-                    mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
+                    /* trail byte */
+                    uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
+                    if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
+                        (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
+                    ) {
+                        tempBuf[0] = (char) (leadByte+0x80) ;
+                        tempBuf[1] = (char) (mySourceChar+0x80);
+                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+                            tempBuf, 2, args->converter->useFallback);
+                    } else {
+                        targetUniChar = 0xffff;
+                    }
+                    /* add another bit so that the code below writes 2 bytes in case of error */
+                    mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
                     args->converter->toUnicodeStatus =0x00;
-                    targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
-                        tempBuf, 2, args->converter->useFallback);
                 }
             }
             else{
-                if(args->converter->fromUnicodeStatus == 0x00){
-                    targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
-                        mySource - 1, 1, args->converter->useFallback);
-                }
-                else{
-                    goto SAVE_STATE;
+                if(mySourceChar == UCNV_TILDE) {
+                    args->converter->mode = UCNV_TILDE;
+                    continue;
+                } else if(mySourceChar <= 0x7f) {
+                    targetUniChar = (UChar)mySourceChar;  /* ASCII */
+                } else {
+                    targetUniChar = 0xffff;
                 }
-
             }
             if(targetUniChar < 0xfffe){
                 if(args->offsets) {
@@ -248,26 +229,17 @@
 
                 *(myTarget++)=(UChar)targetUniChar;
             }
-            else if(targetUniChar>=0xfffe){
-SAVE_STATE:
+            else /* targetUniChar>=0xfffe */ {
                 if(targetUniChar == 0xfffe){
                     *err = U_INVALID_CHAR_FOUND;
                 }
                 else{
                     *err = U_ILLEGAL_CHAR_FOUND;
                 }
-                if(myData->isStateDBCS){
-                    /* this should never occur since isStateDBCS is set to true 
-                     * only after tempBuf[0] and tempBuf[1]
-                     * are set to the input ..  just to please BEAM 
-                     */
-                    if(tempBuf[0]==0 || tempBuf[1]==0){
-                        *err = U_INTERNAL_PROGRAM_ERROR;
-                    }else{
-                        args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
-                        args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
-                        args->converter->toULength=2;
-                    }
+                if(mySourceChar > 0xff){
+                    args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
+                    args->converter->toUBytes[1] = (uint8_t)mySourceChar;
+                    args->converter->toULength=2;
                 }
                 else{
                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
@@ -328,16 +300,21 @@
                 escSeq = TILDE_ESCAPE;
                 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
                 continue;
-            }
-            else{
+            } else if(mySourceChar <= 0x7f) {
+                length = 1;
+                targetUniChar = mySourceChar;
+            } else {
                 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
                     mySourceChar,&targetUniChar,args->converter->useFallback);
-
-            }
-            /* only DBCS or SBCS characters are expected*/
-            /* DB haracters with high bit set to 1 are expected */
-            if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
-                targetUniChar= missingCharMarker;
+                /* we can only use lead bytes 21..7D and trail bytes 21..7E */
+                if( length == 2 &&
+                    (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
+                    (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
+                ) {
+                    targetUniChar -= 0x8080;
+                } else {
+                    targetUniChar = missingCharMarker;
+                }
             }
             if (targetUniChar != missingCharMarker){
                myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);     
@@ -360,22 +337,22 @@
             
                 if(isTargetUCharDBCS){
                     if( myTargetIndex <targetLength){
-                        myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
+                        myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
                         if(offsets){
                             *(offsets++) = mySourceIndex-1;
                         }
                         if(myTargetIndex < targetLength){
-                            myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
+                            myTarget[myTargetIndex++] =(char) targetUniChar;
                             if(offsets){
                                 *(offsets++) = mySourceIndex-1;
                             }
                         }else{
-                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
+                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
                             *err = U_BUFFER_OVERFLOW_ERROR;
                         } 
                     }else{
-                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
-                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
+                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
+                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
                         *err = U_BUFFER_OVERFLOW_ERROR;
                     }
 
@@ -524,15 +501,14 @@
                   const USetAdder *sa,
                   UConverterUnicodeSet which,
                   UErrorCode *pErrorCode) {
-    /* the tilde '~' is hardcoded in the converter */
-    sa->add(sa->set, 0x7e);
+    /* HZ converts all of ASCII */
+    sa->addRange(sa->set, 0, 0x7f);
 
     /* add all of the code points that the sub-converter handles */
-    /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */
-    ((UConverterDataHZ*)cnv->extraInfo)->
-        gbConverter->sharedData->impl->
-            getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
-                          sa, which, pErrorCode);
+    ucnv_MBCSGetFilteredUnicodeSetForUnicode(
+        ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
+        sa, which, UCNV_SET_FILTER_HZ,
+        pErrorCode);
 }
 
 static const UConverterImpl _HZImpl={
diff -ru icu.6001/source/common/ucnvmbcs.c icu/source/common/ucnvmbcs.c
--- icu.6001/source/common/ucnvmbcs.c	2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnvmbcs.c	2009-06-11 13:30:06.000000000 +0100
@@ -625,8 +625,21 @@
                             /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
                             do {
                                 if( ((st3&1)!=0 || useFallback) &&
-                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) &&
-                                    (uint8_t)(value-0xa1)<=(0xfe-0xa1)
+                                    (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
+                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
+                                ) {
+                                    sa->add(sa->set, c);
+                                }
+                                st3>>=1;
+                                stage3+=2;  /* +=st3Multiplier */
+                            } while((++c&0xf)!=0);
+                            break;
+                        case UCNV_SET_FILTER_HZ:
+                            /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
+                            do {
+                                if( ((st3&1)!=0 || useFallback) &&
+                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
+                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
                                 ) {
                                     sa->add(sa->set, c);
                                 }
diff -ru icu.6001/source/common/ucnvmbcs.h icu/source/common/ucnvmbcs.h
--- icu.6001/source/common/ucnvmbcs.h	2009-06-11 13:22:41.000000000 +0100
+++ icu/source/common/ucnvmbcs.h	2009-06-11 13:30:03.000000000 +0100
@@ -493,6 +493,7 @@
     UCNV_SET_FILTER_2022_CN,
     UCNV_SET_FILTER_SJIS,
     UCNV_SET_FILTER_GR94DBCS,
+    UCNV_SET_FILTER_HZ,
     UCNV_SET_FILTER_COUNT
 } UConverterSetFilter;
 
diff -ru icu.6001/source/test/cintltst/ncnvtst.c icu/source/test/cintltst/ncnvtst.c
--- icu.6001/source/test/cintltst/ncnvtst.c	2009-06-11 13:22:40.000000000 +0100
+++ icu/source/test/cintltst/ncnvtst.c	2009-06-11 13:30:03.000000000 +0100
@@ -1928,7 +1928,7 @@
 #if !UCONFIG_NO_LEGACY_CONVERSION
         { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
         { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
-        { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
+        /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */
         { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
 #else
         { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }
diff -ru icu.6001/source/test/intltest/convtest.cpp icu/source/test/intltest/convtest.cpp
--- icu.6001/source/test/intltest/convtest.cpp	2009-06-11 13:22:40.000000000 +0100
+++ icu/source/test/intltest/convtest.cpp	2009-06-11 13:30:03.000000000 +0100
@@ -538,7 +538,7 @@
         "Shift-JIS",
         "ibm-1390",  // EBCDIC_STATEFUL table
         "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
-        // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...]
+        "HZ",
         "ISO-2022-JP",
         "JIS7",
         "ISO-2022-CN",
diff -ru icu.6001/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt
--- icu.6001/source/test/testdata/conversion.txt	2009-06-11 13:22:40.000000000 +0100
+++ icu/source/test/testdata/conversion.txt	2009-06-11 13:30:03.000000000 +0100
@@ -48,6 +48,14 @@
     toUnicode {
       Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
       Cases {
+        // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
+        {
+          "HZ",
+          :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b },
+          "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+",
+          :intvector{ 2,4,6,8,10,12,14,18,19,21,24 },
+          :int{1}, :int{1}, "", "?", :bin{""}
+        }
         // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
         // using the Shift-JIS table for JIS X 0208 (ticket #5797)
         {
@@ -1349,6 +1357,14 @@
           :int{0}
         }
 
+        // HZ
+        {
+          "HZ",
+          "[\u0410-\u044f\u4e00\u4e01\u4e03]",
+          "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
+          :int{0}
+        }
+        
         // DBCS-only
         {
           "ibm-971",


Index: icu.spec
===================================================================
RCS file: /cvs/pkgs/rpms/icu/F-9/icu.spec,v
retrieving revision 1.72
retrieving revision 1.73
diff -u -p -r1.72 -r1.73
--- icu.spec	26 Aug 2008 12:59:56 -0000	1.72
+++ icu.spec	11 Jun 2009 19:28:45 -0000	1.73
@@ -1,6 +1,6 @@
 Name:      icu
 Version:   3.8.1
-Release:   8%{?dist}
+Release:   9%{?dist}
 Summary:   International Components for Unicode
 Group:     Development/Tools
 License:   MIT
@@ -18,6 +18,11 @@ Patch6: icu.icu5498.openoffice.org.patch
 Patch7: icu.regexp.patch
 Patch8: icu.icu6213.worstcase.patch
 Patch9: icu.icu6108.malayalam.samvruthokaram.patch
+Patch10: icu.icu5797.backport.patch
+Patch11: icu.icu6001.backport.patch
+Patch12: icu.icu6002.backport.patch
+Patch13: icu.icu6175.emptysegments.patch
+Patch14: icu.icu5691.backport.patch
 
 %description
 Tools and utilities for developing with icu.
@@ -64,6 +69,11 @@ Group:   Documentation
 %patch7 -p0 -b .regexp.patch
 %patch8 -p1 -b .icu6213.worstcase.patch
 %patch9 -p1 -b .icu6108.malayalam.samvruthokaram.patch
+%patch10 -p1 -b .icu5797.backport.patch
+%patch11 -p1 -b .icu6001.backport.patch
+%patch12 -p1 -b .icu6002.backport.patch
+%patch13 -p1 -b .icu6175.emptysegments.patch
+%patch14 -p1 -b .icu.icu5691.backport.patch
 
 %build
 cd source
@@ -141,6 +151,9 @@ rm -rf $RPM_BUILD_ROOT
 %doc source/__docs/%{name}/html/*
 
 %changelog
+* Thu Jun 11 2009 Caolan McNamara <caolanm at redhat.com> - 3.8.1-9
+- Resolves: rhbz#505368 CVE-2009-0153 Handle illegal sequences consistently
+
 * Tue Aug 26 2008 Caolan McNamara <caolanm at redhat.com> - 3.8.1-8
 - Resolves: rhbz#459698 drop Malayalam patches. Note test with
   multiple fonts and not just Lohit Malayalam before filing bugs against icu