/* Convert multibyte character to 32-bit wide character. Copyright (C) 2020-2023 Free Software Foundation, Inc. This file is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ /* Written by Bruno Haible , 2020. */ #include /* Specification. */ #include #include "attribute.h" #include #include #if GL_CHAR32_T_IS_UNICODE # include "lc-charset-unicode.h" #endif #if GNULIB_defined_mbstate_t /* AIX, IRIX */ /* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales and directly for the UTF-8 locales. */ /* Note: On AIX (64-bit) we can implement mbrtoc32 in two equivalent ways: - in a way that parallels the override of mbrtowc; this is the code branch here; - in a way that invokes the overridden mbrtowc; this would be the #else branch below. They are equivalent. */ # if defined _WIN32 && !defined __CYGWIN__ # define WIN32_LEAN_AND_MEAN /* avoid including junk */ # include # elif HAVE_PTHREAD_API # include # if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS # include # pragma weak thrd_exit # define c11_threads_in_use() (thrd_exit != NULL) # else # define c11_threads_in_use() 0 # endif # elif HAVE_THREADS_H # include # endif # include "lc-charset-dispatch.h" # include "mbtowc-lock.h" static_assert (sizeof (mbstate_t) >= 4); static char internal_state[4]; size_t mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) { # define FITS_IN_CHAR_TYPE(wc) 1 # include "mbrtowc-impl.h" } #else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */ /* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc(). */ # include # include "localcharset.h" # include "streq.h" # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ # include "hard-locale.h" # include # endif static mbstate_t internal_state; size_t mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) # undef mbrtoc32 { /* It's simpler to handle the case s == NULL upfront, than to worry about this case later, before every test of pwc and n. */ if (s == NULL) { pwc = NULL; s = ""; n = 1; } # if MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T if (n == 0) return (size_t) -2; # endif if (ps == NULL) ps = &internal_state; # if HAVE_WORKING_MBRTOC32 /* mbrtoc32() may produce different values for wc than mbrtowc(). Therefore use mbrtoc32(). */ # if defined _WIN32 && !defined __CYGWIN__ char32_t wc; size_t ret = mbrtoc32 (&wc, s, n, ps); if (ret < (size_t) -2 && pwc != NULL) *pwc = wc; # else size_t ret = mbrtoc32 (pwc, s, n, ps); # endif # if GNULIB_MBRTOC32_REGULAR /* Verify that mbrtoc32 is regular. */ if (ret < (size_t) -3 && ! mbsinit (ps)) /* This occurs on glibc 2.36. */ mbszero (ps); if (ret == (size_t) -3) abort (); # endif # if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE)) { if (pwc != NULL) *pwc = (unsigned char) *s; return 1; } # endif return ret; # elif _GL_SMALL_WCHAR_T /* Special-case all encodings that may produce wide character values > WCHAR_MAX. */ const char *encoding = locale_charset (); if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) { /* Special-case the UTF-8 encoding. Assume that the wide-character encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16. */ /* Here n > 0. */ char *pstate = (char *)ps; size_t nstate = pstate[0]; char buf[4]; const char *p; size_t m; int res; switch (nstate) { case 0: p = s; m = n; break; case 3: buf[2] = pstate[3]; FALLTHROUGH; case 2: buf[1] = pstate[2]; FALLTHROUGH; case 1: buf[0] = pstate[1]; p = buf; m = nstate; buf[m++] = s[0]; if (n >= 2 && m < 4) { buf[m++] = s[1]; if (n >= 3 && m < 4) buf[m++] = s[2]; } break; default: errno = EINVAL; return (size_t)(-1); } /* Here m > 0. */ { # define FITS_IN_CHAR_TYPE(wc) 1 # include "mbrtowc-impl-utf8.h" } success: if (nstate >= (res > 0 ? res : 1)) abort (); res -= nstate; /* Set *ps to an initial state. */ # if defined _WIN32 && !defined __CYGWIN__ /* Native Windows. */ /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter. On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined as an 8-byte struct, of which the first 4 bytes matter. */ *(unsigned int *)pstate = 0; # elif defined __CYGWIN__ /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter. */ ps->__count = 0; # else pstate[0] = 0; # endif return res; incomplete: { size_t k = nstate; /* Here 0 <= k < m < 4. */ pstate[++k] = s[0]; if (k < m) { pstate[++k] = s[1]; if (k < m) pstate[++k] = s[2]; } if (k != m) abort (); } pstate[0] = m; return (size_t)(-2); invalid: errno = EILSEQ; /* The conversion state is undefined, says POSIX. */ return (size_t)(-1); } else { wchar_t wc; size_t ret = mbrtowc (&wc, s, n, ps); if (ret < (size_t) -2 && pwc != NULL) *pwc = wc; return ret; } # else /* char32_t and wchar_t are equivalent. Use mbrtowc(). */ wchar_t wc; size_t ret = mbrtowc (&wc, s, n, ps); # if GNULIB_MBRTOC32_REGULAR /* Ensure that mbrtoc32 is regular. */ if (ret < (size_t) -2 && ! mbsinit (ps)) /* This occurs on glibc 2.12. */ mbszero (ps); # endif # if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION if (ret < (size_t) -2 && wc != 0) { wc = locale_encoding_to_unicode (wc); if (wc == 0) { ret = (size_t) -1; errno = EILSEQ; } } # endif if (ret < (size_t) -2 && pwc != NULL) *pwc = wc; return ret; # endif } #endif