Roland Mainz
2013-09-16 16:05:07 UTC
Hi!
----
Attached (as "astksh20130913_gb18030_mbinit_fixes001.diff.txt") is a
patch that fixes several shift-state handling issues found while
testing \u[hex] and \w[hex] ...
* Notes:
- Depending on the locale, |iswalpha(wchar_euro)| may or may not return
|true| for the Euro symbol (Unicode codepoint 0x20ac). This is what the
"wchar.sh" test was tripping over with "zh_CN.GB18030" on Solaris
11/B145.
----
Bye,
Roland
--
__ . . __
(o.\ \/ /.o) roland.mainz at nrubsig.org
\__\/\/__/ MPEG specialist, C&&JAVA&&Sun&&Unix programmer
/O /==\ O\ TEL +49 641 3992797
(;O/ \/ \O;)
-------------- next part --------------
diff -r -u build_i386_64bit_debug/src/cmd/ksh93/sh/string.c build_i386_64bit_debug_wcharfix/src/cmd/ksh93/sh/string.c
--- src/cmd/ksh93/sh/string.c 2013-09-12 18:00:21.000000000 +0200
+++ src/cmd/ksh93/sh/string.c 2013-09-16 17:49:39.214780585 +0200
@@ -339,7 +339,9 @@
if(!cp)
return((char*)0);
offset = staktell();
+#if SHOPT_MULTIBYTE
mbinit();
+#endif
state = ((c= mbchar(cp))==0);
#if SHOPT_MULTIBYTE
lc_unicode = quote=='u' ? 1 : quote=='U' ? 0 : !!(ast.locale.set & AST_LC_unicode);
@@ -349,7 +351,8 @@
quote = '\'';
if(isaletter(c) && (!lc_unicode || c<=0x7f))
{
- while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f));
+ while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f))
+ ;
if(c==0)
return((char*)string);
if(c=='=')
diff -r -u build_i386_64bit_debug/src/cmd/ksh93/tests/wchar.sh build_i386_64bit_debug_wcharfix/src/cmd/ksh93/tests/wchar.sh
--- src/cmd/ksh93/tests/wchar.sh 2013-09-12 18:12:23.000000000 +0200
+++ src/cmd/ksh93/tests/wchar.sh 2013-09-16 17:29:11.934026898 +0200
@@ -32,31 +32,45 @@
Command=${0##*/}
integer Errors=0
-locales="en_US.UTF-8 en_US.ISO-8859-15 zh_CN.GB18030"
-supported="C.UTF-8"
+typeset -a locales=(
+ 'en_US.UTF-8'
+ 'en_US.ISO8859-15'
+ 'zh_CN.GB18030'
+)
+typeset -a supported=( 'C.UTF-8' )
-for lc_all in $locales
+for lc_all in "${locales[@]}"
do if {
PATH=/bin:/usr/bin:$PATH locale -a | grep -w ${lc_all%.*} &&
LC_ALL=$lc_all PATH=/bin:/usr/bin:$PATH iconv -f ${lc_all#*.} -t UTF-8 </dev/null
} >/dev/null 2>&1
- then supported+=" $lc_all"
+ then supported+=( "$lc_all" )
else : "LC_ALL=$lc_all not supported" :
fi
done
-exp0=$'0000000 24 27 e2 82 ac 27 0a'
exp2=$'\'\\u[20ac]\''
exp1='$'$exp2
-for lc_all in $supported
-do
+for lc_all in "${supported[@]}" ; do
+
+# We need both cases here since locales may or may not handle the
+# Euro symbol as alphabetical symbol
+# $ ~/bin/ksh -c 'LC_ALL=en_US.UTF-8 ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha'
+# isnotalpha
+# $ ~/bin/ksh -c 'LC_ALL=zh_CN.GB18030 ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha'
+# isalpha
+if $SHELL -c 'LC_ALL='${lc_all}' ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]]' ; then
+ exp0=$'~(E)(0000000\ (e2\ 82\ ac)\ (0a))' # <euro>
+else
+ exp0=$'~(E)(0000000\ (24)\ (27)\ (e2\ 82\ ac)\ (27)\ (0a))' # $'<euro>'
+fi
got=$(LC_OPTIONS=nounicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" "$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od -tx1 | head -1)
-[[ $got == "$exp0" ]] || err_exit "${lc_all} nounicode FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
+[[ $got == $exp0 ]] || err_exit "${lc_all} nounicode FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%(nounicode)q\n" "$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od -tx1 | head -1)
-[[ $got == "$exp0" ]] || err_exit "${lc_all} (nounicode) FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
+[[ $got == $exp0 ]] || err_exit "${lc_all} (nounicode) FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" "$(printf "\u[20ac]")"')
[[ $got == "$exp1" || $got == "$exp2" ]] || err_exit "${lc_all} unicode FAILED -- expected $exp1, got $got"
diff -r -u build_i386_64bit_debug/src/lib/libast/string/utf32stowcs.c build_i386_64bit_debug_wcharfix/src/lib/libast/string/utf32stowcs.c
--- src/lib/libast/string/utf32stowcs.c 2013-09-11 19:35:00.000000000 +0200
+++ src/lib/libast/string/utf32stowcs.c 2013-09-14 13:38:25.560529198 +0200
@@ -41,6 +41,8 @@
if (ast.locale.set & AST_LC_utf8)
{
char tmp[UTF8_LEN_MAX+1];
+
+ mbinit();
for (i = 0; i < n; i++)
{
@@ -60,6 +62,10 @@
ast.mb_uc2wc = 0;
if (ast.mb_uc2wc == 0)
return -1;
+
+ /* Reset shift state */
+ (void)iconv(ast.mb_uc2wc, NULL, NULL, NULL, NULL);
+
if (n == 1)
{
char tmp_in[UTF8_LEN_MAX+1];
@@ -84,8 +90,13 @@
return -1;
#endif
}
- else if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) <= 0)
- return -1;
+ else
+ {
+ mbinit();
+
+ if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) <= 0)
+ return -1;
+ }
i = 1;
}
else
@@ -112,6 +123,8 @@
if (mbwide())
{
ssize_t len;
+
+ mbinit();
for (outbuf = outbuf_start; i < n && outbuf < inbuf; i++, outbuf += len)
if ((len = mb2wc(wchar[i], outbuf, inbuf - outbuf)) < 0)
diff -r -u build_i386_64bit_debug/src/lib/libast/string/wcstoutf32s.c build_i386_64bit_debug_wcharfix/src/lib/libast/string/wcstoutf32s.c
--- src/lib/libast/string/wcstoutf32s.c 2013-09-11 15:11:34.000000000 +0200
+++ src/lib/libast/string/wcstoutf32s.c 2013-09-14 13:37:59.191221921 +0200
@@ -34,12 +34,14 @@
ssize_t
wcstoutf32s(uint32_t* utf32, wchar_t* wchar, size_t n)
{
- size_t i;
- ssize_t res;
+ size_t i;
+ ssize_t res;
if (ast.locale.set & AST_LC_utf8)
{
char tmp[UTF8_LEN_MAX+1];
+
+ mbinit();
for (i = 0; i < n; i++)
{
@@ -63,6 +65,10 @@
ast.mb_wc2uc = 0;
if (ast.mb_wc2uc == 0)
return -1;
+
+ /* Reset shift state */
+ (void)iconv(ast.mb_wc2uc, NULL, NULL, NULL, NULL);
+
inbytesleft = n * mbmax();
outbytesleft = n * sizeof(uint32_t);
inbuf_start = oldof(0, char, (inbytesleft + 2) + outbytesleft, 0);
----
Attached (as "astksh20130913_gb18030_mbinit_fixes001.diff.txt") is a
patch that fixes several shift-state handling issues found while
testing \u[hex] and \w[hex] ...
* Notes:
- Depending on the locale, |iswalpha(wchar_euro)| may or may not return
|true| for the Euro symbol (Unicode codepoint 0x20ac). This is what the
"wchar.sh" test was tripping over with "zh_CN.GB18030" on Solaris
11/B145.
----
Bye,
Roland
--
__ . . __
(o.\ \/ /.o) roland.mainz at nrubsig.org
\__\/\/__/ MPEG specialist, C&&JAVA&&Sun&&Unix programmer
/O /==\ O\ TEL +49 641 3992797
(;O/ \/ \O;)
-------------- next part --------------
diff -r -u build_i386_64bit_debug/src/cmd/ksh93/sh/string.c build_i386_64bit_debug_wcharfix/src/cmd/ksh93/sh/string.c
--- src/cmd/ksh93/sh/string.c 2013-09-12 18:00:21.000000000 +0200
+++ src/cmd/ksh93/sh/string.c 2013-09-16 17:49:39.214780585 +0200
@@ -339,7 +339,9 @@
if(!cp)
return((char*)0);
offset = staktell();
+#if SHOPT_MULTIBYTE
mbinit();
+#endif
state = ((c= mbchar(cp))==0);
#if SHOPT_MULTIBYTE
lc_unicode = quote=='u' ? 1 : quote=='U' ? 0 : !!(ast.locale.set & AST_LC_unicode);
@@ -349,7 +351,8 @@
quote = '\'';
if(isaletter(c) && (!lc_unicode || c<=0x7f))
{
- while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f));
+ while((c=mbchar(cp)), isaname(c) && (!lc_unicode || c<=0x7f))
+ ;
if(c==0)
return((char*)string);
if(c=='=')
diff -r -u build_i386_64bit_debug/src/cmd/ksh93/tests/wchar.sh build_i386_64bit_debug_wcharfix/src/cmd/ksh93/tests/wchar.sh
--- src/cmd/ksh93/tests/wchar.sh 2013-09-12 18:12:23.000000000 +0200
+++ src/cmd/ksh93/tests/wchar.sh 2013-09-16 17:29:11.934026898 +0200
@@ -32,31 +32,45 @@
Command=${0##*/}
integer Errors=0
-locales="en_US.UTF-8 en_US.ISO-8859-15 zh_CN.GB18030"
-supported="C.UTF-8"
+typeset -a locales=(
+ 'en_US.UTF-8'
+ 'en_US.ISO8859-15'
+ 'zh_CN.GB18030'
+)
+typeset -a supported=( 'C.UTF-8' )
-for lc_all in $locales
+for lc_all in "${locales[@]}"
do if {
PATH=/bin:/usr/bin:$PATH locale -a | grep -w ${lc_all%.*} &&
LC_ALL=$lc_all PATH=/bin:/usr/bin:$PATH iconv -f ${lc_all#*.} -t UTF-8 </dev/null
} >/dev/null 2>&1
- then supported+=" $lc_all"
+ then supported+=( "$lc_all" )
else : "LC_ALL=$lc_all not supported" :
fi
done
-exp0=$'0000000 24 27 e2 82 ac 27 0a'
exp2=$'\'\\u[20ac]\''
exp1='$'$exp2
-for lc_all in $supported
-do
+for lc_all in "${supported[@]}" ; do
+
+# We need both cases here since locales may or may not handle the
+# Euro symbol as alphabetical symbol
+# $ ~/bin/ksh -c 'LC_ALL=en_US.UTF-8 ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha'
+# isnotalpha
+# $ ~/bin/ksh -c 'LC_ALL=zh_CN.GB18030 ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]] && print isalpha || print isnotalpha'
+# isalpha
+if $SHELL -c 'LC_ALL='${lc_all}' ; [[ $(printf "\u[20ac]") == ~(E)[[:alpha:]] ]]' ; then
+ exp0=$'~(E)(0000000\ (e2\ 82\ ac)\ (0a))' # <euro>
+else
+ exp0=$'~(E)(0000000\ (24)\ (27)\ (e2\ 82\ ac)\ (27)\ (0a))' # $'<euro>'
+fi
got=$(LC_OPTIONS=nounicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" "$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od -tx1 | head -1)
-[[ $got == "$exp0" ]] || err_exit "${lc_all} nounicode FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
+[[ $got == $exp0 ]] || err_exit "${lc_all} nounicode FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%(nounicode)q\n" "$(printf "\u[20ac]")"' | iconv -f ${lc_all#*.} -t UTF-8 | od -tx1 | head -1)
-[[ $got == "$exp0" ]] || err_exit "${lc_all} (nounicode) FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
+[[ $got == $exp0 ]] || err_exit "${lc_all} (nounicode) FAILED -- locale probably not supported -- expected '$exp0', got '$got'"
got=$(LC_OPTIONS=unicode $SHELL -c 'export LC_ALL='${lc_all}'; printf "%q\n" "$(printf "\u[20ac]")"')
[[ $got == "$exp1" || $got == "$exp2" ]] || err_exit "${lc_all} unicode FAILED -- expected $exp1, got $got"
diff -r -u build_i386_64bit_debug/src/lib/libast/string/utf32stowcs.c build_i386_64bit_debug_wcharfix/src/lib/libast/string/utf32stowcs.c
--- src/lib/libast/string/utf32stowcs.c 2013-09-11 19:35:00.000000000 +0200
+++ src/lib/libast/string/utf32stowcs.c 2013-09-14 13:38:25.560529198 +0200
@@ -41,6 +41,8 @@
if (ast.locale.set & AST_LC_utf8)
{
char tmp[UTF8_LEN_MAX+1];
+
+ mbinit();
for (i = 0; i < n; i++)
{
@@ -60,6 +62,10 @@
ast.mb_uc2wc = 0;
if (ast.mb_uc2wc == 0)
return -1;
+
+ /* Reset shift state */
+ (void)iconv(ast.mb_uc2wc, NULL, NULL, NULL, NULL);
+
if (n == 1)
{
char tmp_in[UTF8_LEN_MAX+1];
@@ -84,8 +90,13 @@
return -1;
#endif
}
- else if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) <= 0)
- return -1;
+ else
+ {
+ mbinit();
+
+ if (mb2wc(wchar[0], tmp_out, outbuf - tmp_out) <= 0)
+ return -1;
+ }
i = 1;
}
else
@@ -112,6 +123,8 @@
if (mbwide())
{
ssize_t len;
+
+ mbinit();
for (outbuf = outbuf_start; i < n && outbuf < inbuf; i++, outbuf += len)
if ((len = mb2wc(wchar[i], outbuf, inbuf - outbuf)) < 0)
diff -r -u build_i386_64bit_debug/src/lib/libast/string/wcstoutf32s.c build_i386_64bit_debug_wcharfix/src/lib/libast/string/wcstoutf32s.c
--- src/lib/libast/string/wcstoutf32s.c 2013-09-11 15:11:34.000000000 +0200
+++ src/lib/libast/string/wcstoutf32s.c 2013-09-14 13:37:59.191221921 +0200
@@ -34,12 +34,14 @@
ssize_t
wcstoutf32s(uint32_t* utf32, wchar_t* wchar, size_t n)
{
- size_t i;
- ssize_t res;
+ size_t i;
+ ssize_t res;
if (ast.locale.set & AST_LC_utf8)
{
char tmp[UTF8_LEN_MAX+1];
+
+ mbinit();
for (i = 0; i < n; i++)
{
@@ -63,6 +65,10 @@
ast.mb_wc2uc = 0;
if (ast.mb_wc2uc == 0)
return -1;
+
+ /* Reset shift state */
+ (void)iconv(ast.mb_wc2uc, NULL, NULL, NULL, NULL);
+
inbytesleft = n * mbmax();
outbytesleft = n * sizeof(uint32_t);
inbuf_start = oldof(0, char, (inbytesleft + 2) + outbytesleft, 0);