The problem is: currently our single byte ctype(3) functions are broken

for wide character locales in the argument range >= 0x80 - they may
return false positives.

Example 1: for UTF-8 locale we currently have:
iswspace(0xA0)==1 and isspace(0xA0)==1
(because iswspace() and isspace() are the same code)
but must have
iswspace(0xA0)==1 and isspace(0xA0)==0
(because in the UTF-8 locale there is no single byte character 0xA0, nor
any other in the range 0x80..0xff; only ASCII is kept in the single byte
range, because our internal wchar_t representation for UTF-8 is UCS-4).

Example 2: for all wide character locales isalpha(arg) when arg > 0xFF may
return false positives (must be 0).
(because iswalpha() and isalpha() are the same code)

This change addresses this issue by separating single byte and wide ctype,
and also fixes iswascii() (currently iswascii() is broken for
arguments > 0xFF).
This change is 100% binary compatible with old binaries.

Reviewed by: i18n@
This commit is contained in:
Andrey A. Chernov 2007-10-13 16:28:22 +00:00
parent b46286393c
commit 367ed4e13d
15 changed files with 113 additions and 44 deletions

View File

@ -87,6 +87,8 @@ __END_DECLS
#define __inline
#endif
extern int __mb_sb_limit;
/*
* Use inline functions if we are allowed to and the compiler supports them.
*/
@ -102,16 +104,29 @@ __maskrune(__ct_rune_t _c, unsigned long _f)
_CurrentRuneLocale->__runetype[_c]) & _f;
}
/*
 * Single byte variant of the rune type lookup: return the bits of
 * _CurrentRuneLocale->__runetype[_c] selected by the mask _f.
 * Arguments that are negative or not below __mb_sb_limit yield 0
 * without touching the table.
 */
static __inline int
__sbmaskrune(__ct_rune_t _c, unsigned long _f)
{
	if (_c < 0 || _c >= __mb_sb_limit)
		return (0);
	return (_CurrentRuneLocale->__runetype[_c] & _f);
}
static __inline int
__istype(__ct_rune_t _c, unsigned long _f)
{
return (!!__maskrune(_c, _f));
}
/*
 * Boolean form of __sbmaskrune(): 1 when any of the type bits in _f is
 * reported for _c, 0 otherwise.
 */
static __inline int
__sbistype(__ct_rune_t _c, unsigned long _f)
{
	return (__sbmaskrune(_c, _f) != 0);
}
static __inline int
__isctype(__ct_rune_t _c, unsigned long _f)
{
return (_c < 0 || _c >= _CACHED_RUNES) ? 0 :
return (_c < 0 || _c >= __mb_sb_limit) ? 0 :
!!(_DefaultRuneLocale.__runetype[_c] & _f);
}
@ -122,6 +137,13 @@ __toupper(__ct_rune_t _c)
_CurrentRuneLocale->__mapupper[_c];
}
/*
 * Single byte upper-case mapping: translate _c through
 * _CurrentRuneLocale->__mapupper.  Arguments that are negative or not
 * below __mb_sb_limit are returned unchanged.
 */
static __inline __ct_rune_t
__sbtoupper(__ct_rune_t _c)
{
	if (_c < 0 || _c >= __mb_sb_limit)
		return (_c);
	return (_CurrentRuneLocale->__mapupper[_c]);
}
static __inline __ct_rune_t
__tolower(__ct_rune_t _c)
{
@ -129,6 +151,13 @@ __tolower(__ct_rune_t _c)
_CurrentRuneLocale->__maplower[_c];
}
/*
 * Single byte lower-case mapping: translate _c through
 * _CurrentRuneLocale->__maplower.  Arguments that are negative or not
 * below __mb_sb_limit are returned unchanged.
 */
static __inline __ct_rune_t
__sbtolower(__ct_rune_t _c)
{
	if (_c < 0 || _c >= __mb_sb_limit)
		return (_c);
	return (_CurrentRuneLocale->__maplower[_c]);
}
static __inline int
__wcwidth(__ct_rune_t _c)
{
@ -146,10 +175,14 @@ __wcwidth(__ct_rune_t _c)
__BEGIN_DECLS
int __maskrune(__ct_rune_t, unsigned long);
int __sbmaskrune(__ct_rune_t, unsigned long);
int __istype(__ct_rune_t, unsigned long);
int __sbistype(__ct_rune_t, unsigned long);
int __isctype(__ct_rune_t, unsigned long);
__ct_rune_t __toupper(__ct_rune_t);
__ct_rune_t __sbtoupper(__ct_rune_t);
__ct_rune_t __tolower(__ct_rune_t);
__ct_rune_t __sbtolower(__ct_rune_t);
int __wcwidth(__ct_rune_t);
__END_DECLS
#endif /* using inlines */

View File

@ -86,19 +86,19 @@ int isspecial(int);
#endif
__END_DECLS
#define isalnum(c) __istype((c), _CTYPE_A|_CTYPE_D)
#define isalpha(c) __istype((c), _CTYPE_A)
#define iscntrl(c) __istype((c), _CTYPE_C)
#define isalnum(c) __sbistype((c), _CTYPE_A|_CTYPE_D)
#define isalpha(c) __sbistype((c), _CTYPE_A)
#define iscntrl(c) __sbistype((c), _CTYPE_C)
#define isdigit(c) __isctype((c), _CTYPE_D) /* ANSI -- locale independent */
#define isgraph(c) __istype((c), _CTYPE_G)
#define islower(c) __istype((c), _CTYPE_L)
#define isprint(c) __istype((c), _CTYPE_R)
#define ispunct(c) __istype((c), _CTYPE_P)
#define isspace(c) __istype((c), _CTYPE_S)
#define isupper(c) __istype((c), _CTYPE_U)
#define isgraph(c) __sbistype((c), _CTYPE_G)
#define islower(c) __sbistype((c), _CTYPE_L)
#define isprint(c) __sbistype((c), _CTYPE_R)
#define ispunct(c) __sbistype((c), _CTYPE_P)
#define isspace(c) __sbistype((c), _CTYPE_S)
#define isupper(c) __sbistype((c), _CTYPE_U)
#define isxdigit(c) __isctype((c), _CTYPE_X) /* ANSI -- locale independent */
#define tolower(c) __tolower(c)
#define toupper(c) __toupper(c)
#define tolower(c) __sbtolower(c)
#define toupper(c) __sbtoupper(c)
#if __XSI_VISIBLE
/*
@ -112,24 +112,24 @@ __END_DECLS
*
* XXX isascii() and toascii() should similarly be undocumented.
*/
#define _tolower(c) __tolower(c)
#define _toupper(c) __toupper(c)
#define _tolower(c) __sbtolower(c)
#define _toupper(c) __sbtoupper(c)
#define isascii(c) (((c) & ~0x7F) == 0)
#define toascii(c) ((c) & 0x7F)
#endif
#if __ISO_C_VISIBLE >= 1999
#define isblank(c) __istype((c), _CTYPE_B)
#define isblank(c) __sbistype((c), _CTYPE_B)
#endif
#if __BSD_VISIBLE
#define digittoint(c) __maskrune((c), 0xFF)
#define ishexnumber(c) __istype((c), _CTYPE_X)
#define isideogram(c) __istype((c), _CTYPE_I)
#define isnumber(c) __istype((c), _CTYPE_D)
#define isphonogram(c) __istype((c), _CTYPE_Q)
#define isrune(c) __istype((c), 0xFFFFFF00L)
#define isspecial(c) __istype((c), _CTYPE_T)
#define digittoint(c) __sbmaskrune((c), 0xFF)
#define ishexnumber(c) __sbistype((c), _CTYPE_X)
#define isideogram(c) __sbistype((c), _CTYPE_I)
#define isnumber(c) __sbistype((c), _CTYPE_D)
#define isphonogram(c) __sbistype((c), _CTYPE_Q)
#define isrune(c) __sbistype((c), 0xFFFFFF00L)
#define isspecial(c) __sbistype((c), _CTYPE_T)
#endif
#endif /* !_CTYPE_H_ */

View File

@ -106,7 +106,7 @@ __END_DECLS
#define towupper(wc) __toupper(wc)
#if __BSD_VISIBLE
#define iswascii(wc) (((wc) & ~0x7F) == 0)
#define iswascii(wc) ((wc) < 0x80)
#define iswhexnumber(wc) __istype((wc), _CTYPE_X)
#define iswideogram(wc) __istype((wc), _CTYPE_I)
#define iswnumber(wc) __istype((wc), _CTYPE_D)

View File

@ -60,12 +60,17 @@ FBSD_1.0 {
nextwctype;
nl_langinfo;
__maskrune;
__sbmaskrune;
__istype;
__sbistype;
__isctype;
__toupper;
__sbtoupper;
__tolower;
__sbtolower;
__wcwidth;
__mb_cur_max;
__mb_sb_limit;
rpmatch;
___runetype;
setlocale;

View File

@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include "mblocal.h"
extern int __mb_sb_limit;
static size_t _BIG5_mbrtowc(wchar_t * __restrict, const char * __restrict,
size_t, mbstate_t * __restrict);
static int _BIG5_mbsinit(const mbstate_t *);
@ -68,6 +70,7 @@ _BIG5_init(_RuneLocale *rl)
__mbsinit = _BIG5_mbsinit;
_CurrentRuneLocale = rl;
__mb_cur_max = 2;
__mb_sb_limit = 128;
return (0);
}

View File

@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include "mblocal.h"
extern int __mb_sb_limit;
static size_t _EUC_mbrtowc(wchar_t * __restrict, const char * __restrict,
size_t, mbstate_t * __restrict);
static int _EUC_mbsinit(const mbstate_t *);
@ -116,6 +118,7 @@ _EUC_init(_RuneLocale *rl)
__mbrtowc = _EUC_mbrtowc;
__wcrtomb = _EUC_wcrtomb;
__mbsinit = _EUC_mbsinit;
__mb_sb_limit = 256;
return (0);
}

View File

@ -39,6 +39,8 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include "mblocal.h"
extern int __mb_sb_limit;
static size_t _GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict,
size_t, mbstate_t * __restrict);
static int _GB18030_mbsinit(const mbstate_t *);
@ -59,6 +61,7 @@ _GB18030_init(_RuneLocale *rl)
__mbsinit = _GB18030_mbsinit;
_CurrentRuneLocale = rl;
__mb_cur_max = 4;
__mb_sb_limit = 128;
return (0);
}

View File

@ -35,6 +35,8 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include "mblocal.h"
extern int __mb_sb_limit;
static size_t _GB2312_mbrtowc(wchar_t * __restrict, const char * __restrict,
size_t, mbstate_t * __restrict);
static int _GB2312_mbsinit(const mbstate_t *);
@ -55,6 +57,7 @@ _GB2312_init(_RuneLocale *rl)
__wcrtomb = _GB2312_wcrtomb;
__mbsinit = _GB2312_mbsinit;
__mb_cur_max = 2;
__mb_sb_limit = 128;
return (0);
}

View File

@ -42,6 +42,8 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include "mblocal.h"
extern int __mb_sb_limit;
static size_t _GBK_mbrtowc(wchar_t * __restrict, const char * __restrict,
size_t, mbstate_t * __restrict);
static int _GBK_mbsinit(const mbstate_t *);
@ -61,6 +63,7 @@ _GBK_init(_RuneLocale *rl)
__mbsinit = _GBK_mbsinit;
_CurrentRuneLocale = rl;
__mb_cur_max = 2;
__mb_sb_limit = 128;
return (0);
}

View File

@ -48,7 +48,7 @@ int
digittoint(c)
int c;
{
return (__maskrune(c, 0xFF));
return (__sbmaskrune(c, 0xFF));
}
#undef isalnum
@ -56,7 +56,7 @@ int
isalnum(c)
int c;
{
return (__istype(c, _CTYPE_A|_CTYPE_D));
return (__sbistype(c, _CTYPE_A|_CTYPE_D));
}
#undef isalpha
@ -64,7 +64,7 @@ int
isalpha(c)
int c;
{
return (__istype(c, _CTYPE_A));
return (__sbistype(c, _CTYPE_A));
}
#undef isascii
@ -80,7 +80,7 @@ int
isblank(c)
int c;
{
return (__istype(c, _CTYPE_B));
return (__sbistype(c, _CTYPE_B));
}
#undef iscntrl
@ -88,7 +88,7 @@ int
iscntrl(c)
int c;
{
return (__istype(c, _CTYPE_C));
return (__sbistype(c, _CTYPE_C));
}
#undef isdigit
@ -104,7 +104,7 @@ int
isgraph(c)
int c;
{
return (__istype(c, _CTYPE_G));
return (__sbistype(c, _CTYPE_G));
}
#undef ishexnumber
@ -112,7 +112,7 @@ int
ishexnumber(c)
int c;
{
return (__istype(c, _CTYPE_X));
return (__sbistype(c, _CTYPE_X));
}
#undef isideogram
@ -120,7 +120,7 @@ int
isideogram(c)
int c;
{
return (__istype(c, _CTYPE_I));
return (__sbistype(c, _CTYPE_I));
}
#undef islower
@ -128,7 +128,7 @@ int
islower(c)
int c;
{
return (__istype(c, _CTYPE_L));
return (__sbistype(c, _CTYPE_L));
}
#undef isnumber
@ -136,7 +136,7 @@ int
isnumber(c)
int c;
{
return (__istype(c, _CTYPE_D));
return (__sbistype(c, _CTYPE_D));
}
#undef isphonogram
@ -144,7 +144,7 @@ int
isphonogram(c)
int c;
{
return (__istype(c, _CTYPE_Q));
return (__sbistype(c, _CTYPE_Q));
}
#undef isprint
@ -152,7 +152,7 @@ int
isprint(c)
int c;
{
return (__istype(c, _CTYPE_R));
return (__sbistype(c, _CTYPE_R));
}
#undef ispunct
@ -160,7 +160,7 @@ int
ispunct(c)
int c;
{
return (__istype(c, _CTYPE_P));
return (__sbistype(c, _CTYPE_P));
}
#undef isrune
@ -168,7 +168,7 @@ int
isrune(c)
int c;
{
return (__istype(c, 0xFFFFFF00L));
return (__sbistype(c, 0xFFFFFF00L));
}
#undef isspace
@ -176,7 +176,7 @@ int
isspace(c)
int c;
{
return (__istype(c, _CTYPE_S));
return (__sbistype(c, _CTYPE_S));
}
#undef isspecial
@ -184,7 +184,7 @@ int
isspecial(c)
int c;
{
return (__istype(c, _CTYPE_T));
return (__sbistype(c, _CTYPE_T));
}
#undef isupper
@ -192,7 +192,7 @@ int
isupper(c)
int c;
{
return (__istype(c, _CTYPE_U));
return (__sbistype(c, _CTYPE_U));
}
#undef isxdigit
@ -216,7 +216,7 @@ int
tolower(c)
int c;
{
return (__tolower(c));
return (__sbtolower(c));
}
#undef toupper
@ -224,6 +224,6 @@ int
toupper(c)
int c;
{
return (__toupper(c));
return (__sbtoupper(c));
}

View File

@ -61,7 +61,7 @@ int
iswascii(wc)
wint_t wc;
{
return ((wc & ~0x7F) == 0);
return (wc < 0x80);
}
#undef iswblank

View File

@ -47,6 +47,8 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include "mblocal.h"
extern int __mb_sb_limit;
static size_t _MSKanji_mbrtowc(wchar_t * __restrict, const char * __restrict,
size_t, mbstate_t * __restrict);
static int _MSKanji_mbsinit(const mbstate_t *);
@ -66,6 +68,7 @@ _MSKanji_init(_RuneLocale *rl)
__mbsinit = _MSKanji_mbsinit;
_CurrentRuneLocale = rl;
__mb_cur_max = 2;
__mb_sb_limit = 256;
return (0);
}

View File

@ -58,6 +58,11 @@ static size_t _none_wcrtomb(char * __restrict, wchar_t,
static size_t _none_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
size_t, size_t, mbstate_t * __restrict);
/* setup defaults */
int __mb_cur_max = 1;
int __mb_sb_limit = 256; /* Expected to be <= _CACHED_RUNES */
int
_none_init(_RuneLocale *rl)
{
@ -69,6 +74,7 @@ _none_init(_RuneLocale *rl)
__wcsnrtombs = _none_wcsnrtombs;
_CurrentRuneLocale = rl;
__mb_cur_max = 1;
__mb_sb_limit = 256;
return(0);
}
@ -176,7 +182,6 @@ _none_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
/* setup defaults */
int __mb_cur_max = 1;
size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, size_t,
mbstate_t * __restrict) = _none_mbrtowc;
int (*__mbsinit)(const mbstate_t *) = _none_mbsinit;

View File

@ -45,6 +45,8 @@ __FBSDID("$FreeBSD$");
#include "mblocal.h"
#include "setlocale.h"
extern int __mb_sb_limit;
extern _RuneLocale *_Read_RuneMagi(FILE *);
static int __setrunelocale(const char *);
@ -59,6 +61,7 @@ __setrunelocale(const char *encoding)
static char ctype_encoding[ENCODING_LEN + 1];
static _RuneLocale *CachedRuneLocale;
static int Cached__mb_cur_max;
static int Cached__mb_sb_limit;
static size_t (*Cached__mbrtowc)(wchar_t * __restrict,
const char * __restrict, size_t, mbstate_t * __restrict);
static size_t (*Cached__wcrtomb)(char * __restrict, wchar_t,
@ -85,6 +88,7 @@ __setrunelocale(const char *encoding)
strcmp(encoding, ctype_encoding) == 0) {
_CurrentRuneLocale = CachedRuneLocale;
__mb_cur_max = Cached__mb_cur_max;
__mb_sb_limit = Cached__mb_sb_limit;
__mbrtowc = Cached__mbrtowc;
__mbsinit = Cached__mbsinit;
__mbsnrtowcs = Cached__mbsnrtowcs;
@ -147,6 +151,7 @@ __setrunelocale(const char *encoding)
}
CachedRuneLocale = _CurrentRuneLocale;
Cached__mb_cur_max = __mb_cur_max;
Cached__mb_sb_limit = __mb_sb_limit;
Cached__mbrtowc = __mbrtowc;
Cached__mbsinit = __mbsinit;
Cached__mbsnrtowcs = __mbsnrtowcs;

View File

@ -35,6 +35,8 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include "mblocal.h"
extern int __mb_sb_limit;
static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
size_t, mbstate_t * __restrict);
static int _UTF8_mbsinit(const mbstate_t *);
@ -63,6 +65,7 @@ _UTF8_init(_RuneLocale *rl)
__wcsnrtombs = _UTF8_wcsnrtombs;
_CurrentRuneLocale = rl;
__mb_cur_max = 6;
__mb_sb_limit = 128;
return (0);
}