From f41964fcab0d34c510aa0e6a2a4471b6770937b3 Mon Sep 17 00:00:00 2001 From: Jonas 'Sortie' Termansen Date: Thu, 17 Apr 2014 17:41:37 +0200 Subject: [PATCH] Reimplement wchar conversion API. --- libc/Makefile | 3 + libc/include/stdlib.h | 5 +- libc/include/wchar.h | 22 ++--- libc/stdlib/mblen.cpp | 18 ++-- libc/stdlib/mbstowcs.cpp | 18 ++-- libc/stdlib/mbtowc.cpp | 16 +++- libc/stdlib/wcstombs.cpp | 15 ++-- libc/stdlib/wctomb.cpp | 13 ++- libc/wchar/mbrlen.cpp | 59 +------------ libc/wchar/mbrtowc.cpp | 177 +++++++++++++++++++++++--------------- libc/wchar/mbsinit.cpp | 30 +++++++ libc/wchar/mbsnrtowcs.cpp | 82 ++++++++++++++++++ libc/wchar/mbsrtowcs.cpp | 49 ++--------- libc/wchar/wcrtomb.cpp | 115 ++++++++++++++++--------- libc/wchar/wcsnrtombs.cpp | 87 +++++++++++++++++++ libc/wchar/wcsrtombs.cpp | 57 ++---------- 16 files changed, 463 insertions(+), 303 deletions(-) create mode 100644 libc/wchar/mbsinit.cpp create mode 100644 libc/wchar/mbsnrtowcs.cpp create mode 100644 libc/wchar/wcsnrtombs.cpp diff --git a/libc/Makefile b/libc/Makefile index c8b27f9c..82557455 100644 --- a/libc/Makefile +++ b/libc/Makefile @@ -238,6 +238,8 @@ time/strftime.o \ time/timegm.o \ wchar/mbrlen.o \ wchar/mbrtowc.o \ +wchar/mbsinit.o \ +wchar/mbsnrtowcs.o \ wchar/mbsrtowcs.o \ wchar/wcrtomb.o \ wchar/wcscat.o \ @@ -251,6 +253,7 @@ wchar/wcslen.o \ wchar/wcsncat.o \ wchar/wcsncmp.o \ wchar/wcsncpy.o \ +wchar/wcsnrtombs.o \ wchar/wcspbrk.o \ wchar/wcsrchr.o \ wchar/wcsrtombs.o \ diff --git a/libc/include/stdlib.h b/libc/include/stdlib.h index f2d38f3a..f138332c 100644 --- a/libc/include/stdlib.h +++ b/libc/include/stdlib.h @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013. + Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013, 2014. This file is part of the Sortix C Library. @@ -39,8 +39,7 @@ __BEGIN_DECLS /* TODO: This random interface is stupid. What should a good value be? 
*/ #define RAND_MAX 32767 -/* TODO: This is just a value. It's not a compile time constant! */ -#define MB_CUR_MAX 16 +#define MB_CUR_MAX 6 typedef struct { diff --git a/libc/include/wchar.h b/libc/include/wchar.h index b092a4e8..0f7f7fb9 100644 --- a/libc/include/wchar.h +++ b/libc/include/wchar.h @@ -105,12 +105,15 @@ typedef __wint_t wint_t; /* Conversion state information. */ typedef struct { - int __count; - union - { - wint_t __wch; - char __wchb[4]; - } __value; /* Value so far. */ +#if defined(__is_sortix_libc) + unsigned short count; + unsigned short length; + wint_t wch; +#else + unsigned short __count; + unsigned short __length; + wint_t __wch; +#endif } mbstate_t; #define __mbstate_t_defined 1 #endif @@ -126,12 +129,11 @@ struct tm; /* TODO: wint_t getwchar(void); */ size_t mbrlen(const char* __restrict, size_t, mbstate_t* __restrict); size_t mbrtowc(wchar_t* __restrict, const char* __restrict, size_t, mbstate_t* __restrict); -/* TODO: int mbsinit(const mbstate_t*); */ +int mbsinit(const mbstate_t*); size_t mbsrtowcs(wchar_t* __restrict, const char** __restrict, size_t, mbstate_t* __restrict); /* TODO: wint_t putwc(wchar_t, FILE*); */ /* TODO: wint_t putwchar(wchar_t); */ /* TODO: wint_t ungetwc(wint_t, FILE*); */ - size_t wcrtomb(char* __restrict, wchar_t, mbstate_t* __restrict); wchar_t* wcscat(wchar_t* __restrict, const wchar_t* __restrict); wchar_t* wcschr(const wchar_t*, wchar_t); @@ -193,7 +195,7 @@ int wcwidth(wchar_t); /* Functions from POSIX 2008. 
*/ #if __USE_SORTIX || 200809L <= __USE_POSIX -/* TODO: size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict); */ +size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict); /* TODO: FILE* open_wmemstream(wchar_t**, size_t*); */ /* TODO: wchar_t* wcpcpy(wchar_t* __restrict, const wchar_t* __restrict); */ /* TODO: wchar_t* wcpncpy(wchar_t* __restrict, const wchar_t* __restrict, size_t); */ @@ -204,7 +206,7 @@ int wcwidth(wchar_t); /* TODO: int wcsncasecmp(const wchar_t*, const wchar_t *, size_t); */ /* TODO: int wcsncasecmp_l(const wchar_t*, const wchar_t *, size_t, locale_t); */ /* TODO: size_t wcsnlen(const wchar_t*, size_t); */ -/* TODO: size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict); */ +size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict); /* TODO: size_t wcsxfrm_l(wchar_t* __restrict, const wchar_t* __restrict, size_t, locale_t); */ #endif diff --git a/libc/stdlib/mblen.cpp b/libc/stdlib/mblen.cpp index c4378cbd..ac88ab2d 100644 --- a/libc/stdlib/mblen.cpp +++ b/libc/stdlib/mblen.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2013. + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. This file is part of the Sortix C Library. @@ -26,18 +26,18 @@ #include #include +// TODO: This function is unpure and should be removed. extern "C" int mblen(const char* s, size_t n) { + wchar_t wc; static mbstate_t ps; + size_t result = mbrtowc(&wc, s, n, &ps); if ( !s ) - { memset(&ps, 0, sizeof(ps)); - return 0; // TODO: Give the correct return value depending on ps. - } - size_t ret = mbrlen(s, n, &ps); - if ( ret == (size_t) -2 ) + if ( result == (size_t) -1 ) + return memset(&ps, 0, sizeof(ps)), -1; + // TODO: Should ps be cleared to zero in this case? 
+ if ( result == (size_t) -2 ) return -1; - if ( ret == (size_t) -1 ) - return -1; - return (int) ret; + return (int) result; } diff --git a/libc/stdlib/mbstowcs.cpp b/libc/stdlib/mbstowcs.cpp index b899f63c..4b2dbfca 100644 --- a/libc/stdlib/mbstowcs.cpp +++ b/libc/stdlib/mbstowcs.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2013. + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. This file is part of the Sortix C Library. @@ -23,16 +23,14 @@ *******************************************************************************/ #include +#include #include -extern "C" size_t mbstowcs(wchar_t* dst, const char* src, size_t n) +// TODO: This function is unpure and should be removed. +extern "C" +size_t mbstowcs(wchar_t* restrict dst, const char* restrict src, size_t n) { - // Reset the secret conversion state variable in mbsrtowcs that is used when - // ps is NULL by successfully converting the empty string. As always, this - // is not multithread secure. For some reason, the standards don't mandate - // that the conversion state is reset when mbsrtowcs is called with ps=NULL, - // which arguably is a feature - but this function is supposed to do it. - const char* empty_string = ""; - mbsrtowcs(NULL, &empty_string, 0, NULL); - return mbsrtowcs(dst, &src, n, NULL); + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + return mbsrtowcs(dst, (const char**) &src, n, &ps); } diff --git a/libc/stdlib/mbtowc.cpp b/libc/stdlib/mbtowc.cpp index fffdee4f..fc410a1d 100644 --- a/libc/stdlib/mbtowc.cpp +++ b/libc/stdlib/mbtowc.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2011, 2012. + Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2014. This file is part of the Sortix C Library. @@ -24,10 +24,20 @@ #include #include +#include #include // TODO: This function is unpure and should be removed. 
-extern "C" int mbtowc(wchar_t* pwd, const char* s, size_t n) +extern "C" int mbtowc(wchar_t* pwc, const char* s, size_t n) { - return mbrtowc(pwd, s, n, NULL); + static mbstate_t ps; + size_t result = mbrtowc(pwc, s, n, &ps); + if ( !s ) + memset(&ps, 0, sizeof(ps)); + if ( result == (size_t) -1 ) + return memset(&ps, 0, sizeof(ps)), -1; + // TODO: Should ps be cleared to zero in this case? + if ( result == (size_t) -2 ) + return -1; + return (int) result; } diff --git a/libc/stdlib/wcstombs.cpp b/libc/stdlib/wcstombs.cpp index d305ff0a..c6cf6be1 100644 --- a/libc/stdlib/wcstombs.cpp +++ b/libc/stdlib/wcstombs.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2013. + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. This file is part of the Sortix C Library. @@ -23,16 +23,13 @@ *******************************************************************************/ #include +#include #include +// TODO: This function is unpure and should be removed. extern "C" size_t wcstombs(char* dst, const wchar_t* src, size_t n) { - // Reset the secret conversion state variable in wcsrtombs that is used when - // ps is NULL by successfully converting the empty string. As always, this - // is not multithread secure. For some reason, the standards don't mandate - // that the conversion state is reset when wcsrtombs is called with ps=NULL, - // which arguably is a feature - but this function is supposed to do it. 
- const wchar_t* empty_string = L""; - wcsrtombs(NULL, &empty_string, 0, NULL); - return wcsrtombs(dst, &src, n, NULL); + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + return wcsrtombs(dst, &src, n, &ps); } diff --git a/libc/stdlib/wctomb.cpp b/libc/stdlib/wctomb.cpp index ecbf22df..a2bb5c70 100644 --- a/libc/stdlib/wctomb.cpp +++ b/libc/stdlib/wctomb.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2012. + Copyright(C) Jonas 'Sortie' Termansen 2012, 2014. This file is part of the Sortix C Library. @@ -23,10 +23,19 @@ *******************************************************************************/ #include +#include #include // TODO: This function is unpure and should be removed. extern "C" int wctomb(char* s, wchar_t wc) { - return wcrtomb(s, wc, NULL); + static mbstate_t ps; + size_t result = wcrtomb(s, wc, &ps); + if ( !s ) + memset(&ps, 0, sizeof(ps)); + if ( result == (size_t) -1 ) + return -1; + if ( result == (size_t) -2 ) + return -1; + return (int) result; } diff --git a/libc/wchar/mbrlen.cpp b/libc/wchar/mbrlen.cpp index d9417009..2589f837 100644 --- a/libc/wchar/mbrlen.cpp +++ b/libc/wchar/mbrlen.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2013. + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. This file is part of the Sortix C Library. 
@@ -22,64 +22,11 @@ *******************************************************************************/ -#include -#include #include -static size_t utf8_header_length(unsigned char uc) -{ - if ( (uc & 0b11000000) == 0b10000000 ) - return 0; - if ( (uc & 0b10000000) == 0b00000000 ) - return 1; - if ( (uc & 0b11100000) == 0b11000000 ) - return 2; - if ( (uc & 0b11110000) == 0b11100000 ) - return 3; - if ( (uc & 0b11111000) == 0b11110000 ) - return 4; - if ( (uc & 0b11111100) == 0b11111000 ) - return 5; - if ( (uc & 0b11111110) == 0b11111100 ) - return 6; - return (size_t) -1; -} - -// TODO: Use the shift state. extern "C" size_t mbrlen(const char* restrict s, size_t n, mbstate_t* restrict ps) { - size_t expected_length; - - for ( size_t i = 0; i < n; i++ ) - { - unsigned char uc = (unsigned char) s[i]; - - if ( i == 0 ) - { - if ( !uc ) - { - memset(ps, 0, sizeof(*ps)); - return 0; - } - - if ( (expected_length = utf8_header_length(uc)) == (size_t) -1 ) - return errno = EILSEQ, (size_t) -1; - - // Check if we encounted an unexpected character claiming to be in - // the middle of a UTF-8 multibyte sequence (10xxxxxx). - if ( expected_length == 0 ) - // TODO: Should we play catch up with the partial sequence? - return errno = EILSEQ, (size_t) -1; - } - - // All non-header bytes should be of the form 10xxxxxx. - if ( 0 < i && expected_length < n && (uc & 0b11000000) != 0b10000000 ) - return errno = EILSEQ, (size_t) -1; - - if ( i + 1 == expected_length ) - return i + 1; - } - - return (size_t) -2; + static mbstate_t static_ps; + return mbrtowc(NULL, s, n, ps ? ps : &static_ps); } diff --git a/libc/wchar/mbrtowc.cpp b/libc/wchar/mbrtowc.cpp index 2c82644b..ac663178 100644 --- a/libc/wchar/mbrtowc.cpp +++ b/libc/wchar/mbrtowc.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2012. + Copyright(C) Jonas 'Sortie' Termansen 2012, 2014. This file is part of the Sortix C Library. 
@@ -24,82 +24,123 @@ #include #include +#include #include -extern "C" -size_t mbrtowc(wchar_t* restrict pwc, const char* restrict s, size_t n, - mbstate_t* restrict /*ps*/) +static +size_t utf8_mbrtowc(wchar_t* restrict pwc, + const char* restrict s, + size_t n, + mbstate_t* restrict ps) { - if ( !s ) + size_t i; + for ( i = 0; !(i && ps->count == 0); i++ ) { - // TODO: Restore ps to initial state if currently valid. - return 0; - } - uint8_t* buf = (uint8_t*) s; - wchar_t ret = 0; - size_t numbytes = 0; - size_t sequence_len = 1; - while ( numbytes < sequence_len ) - { - if ( numbytes == n ) - { - // TODO: Support restore through the mbstate_t! + // Handle the case where we were not able to fully decode a character, + // but it is still possible to finish decoding given more bytes. + if ( n <= i ) return (size_t) -2; + + char c = s[i]; + unsigned char uc = (unsigned char) c; + + // The initial state is that we expect a leading byte that informs us of + // the length of this character sequence. The number of consecutive high + // order bits tells us how many bytes make up this character (one + // leading byte followed by zero or more continuation bytes). 
+ if ( ps->count == 0 ) + { + if ( (uc & 0b10000000) == 0b00000000 ) /* 0xxxxxxx */ + { + ps->length = (ps->count = 0) + 1; + ps->wch = (wchar_t) uc & 0b1111111; + } + else if ( (uc & 0b11100000) == 0b11000000 ) /* 110xxxxx */ + { + ps->length = (ps->count = 1) + 1; + ps->wch = (wchar_t) uc & 0b11111; + } + else if ( (uc & 0b11110000) == 0b11100000 ) /* 1110xxxx */ + { + ps->length = (ps->count = 2) + 1; + ps->wch = (wchar_t) uc & 0b1111; + } + else if ( (uc & 0b11111000) == 0b11110000 ) /* 11110xxx */ + { + ps->length = (ps->count = 3) + 1; + ps->wch = (wchar_t) uc & 0b111; + } +#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */ + else if ( (uc & 0b11111100) == 0b11111000 ) /* 111110xx */ + { + ps->length = (ps->count = 4) + 1) + 1; + ps->wch = (wchar_t) uc & 0b11; + } + else if ( (uc & 0b11111110) == 0b11111100 ) /* 1111110x */ + { + ps->length = (ps->count = 5) + 1) + 1; + ps->wch = (wchar_t) uc & 0b1; + } +#endif + else + return errno = EILSEQ, (size_t) -1; } - uint8_t b = buf[numbytes++]; - bool is_continuation = b >> (8-2) == 0b10; - if ( 1 == numbytes && is_continuation ) - return errno = EILSEQ, (size_t) -1; - if ( 2 <= numbytes && !is_continuation ) - return errno = EILSEQ, (size_t) -1; - - wchar_t new_bits; - size_t new_bits_num; - if ( b >> (8-1) == 0b0 ) - new_bits = b & 0b01111111, - new_bits_num = 7, - sequence_len = 1; - else if ( b >> (8-2) == 0b10 ) - new_bits = b & 0b00111111, - new_bits_num = 6, - sequence_len = 2; - else if ( b >> (8-3) == 0b110 ) - new_bits = b & 0b00011111, - new_bits_num = 5, - sequence_len = 3; - else if ( b >> (8-4) == 0b1110 ) - new_bits = b & 0b00001111, - new_bits_num = 4, - sequence_len = 4; - else if ( b >> (8-5) == 0b11110 ) - new_bits = b & 0b00000111, - new_bits_num = 3, - sequence_len = 5; - else if ( b >> (8-6) == 0b111110 ) - new_bits = b & 0b00000011, - new_bits_num = 2, - sequence_len = 6; - else if ( b >> (8-7) == 0b1111110 ) - new_bits = b & 0b00000001, - new_bits_num = 1, - sequence_len = 7; + 
// The secondary state is that following a leading byte, we are + // expecting a non-zero number of continuation bytes. else - return errno = EILSEQ, (size_t) -1; - ret = ret >> new_bits_num | new_bits; + { + // Verify this is a continuation byte. + if ( (uc & 0b11000000) != 0b10000000 ) + return errno = EILSEQ, (size_t) -1; + ps->wch = ps->wch << 6 | (uc & 0b00111111); + ps->count--; + } } - if ( !ret ) - { - // TODO: Reset ps to initial state. - return 0; - } - if ( (numbytes == 2 && ret <= 0x007F) || - (numbytes == 3 && ret <= 0x07FF) || - (numbytes == 4 && ret <= 0xFFFF) || - (numbytes == 5 && ret <= 0x1FFFFF) || - (numbytes == 6 && ret <= 0x3FFFFFF) ) + + // Reject overly long encodings and UTF-16 surrogates (RFC 3629). + if ( ps->length == 2 && ps->wch < (1 << 7) ) return errno = EILSEQ, (size_t) -1; + if ( ps->length == 3 && ps->wch < (1 << (5 + 1 * 6)) ) + return errno = EILSEQ, (size_t) -1; + if ( ps->length == 4 && ps->wch < (1 << (4 + 2 * 6)) ) + return errno = EILSEQ, (size_t) -1; + if ( 0xD800 <= ps->wch && ps->wch <= 0xDFFF ) + return errno = EILSEQ, (size_t) -1; +#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */ + if ( ps->length == 5 && ps->wch < (1 << (3 + 3 * 6)) ) + return errno = EILSEQ, (size_t) -1; + if ( ps->length == 6 && ps->wch < (1 << (2 + 4 * 6)) ) + return errno = EILSEQ, (size_t) -1; +#endif + + // RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF. + if ( 0x10FFFF < ps->wch ) + return errno = EILSEQ, (size_t) -1; + + wchar_t result = ps->wch; + if ( pwc ) - *pwc = ret; - return numbytes; + *pwc = result; + + ps->length = 0; + ps->wch = 0; + + return result != L'\0' ? i : 0; +} + +extern "C" +size_t mbrtowc(wchar_t* restrict pwc, + const char* restrict s, + size_t n, + mbstate_t* restrict ps) +{ + static mbstate_t static_ps; + if ( !ps ) + ps = &static_ps; + if ( !s ) + s = "", n = 1; + + // TODO: Verify whether the current locale is UTF-8. 
+ return utf8_mbrtowc(pwc, s, n, ps); } diff --git a/libc/wchar/mbsinit.cpp b/libc/wchar/mbsinit.cpp new file mode 100644 index 00000000..1bc8ac51 --- /dev/null +++ b/libc/wchar/mbsinit.cpp @@ -0,0 +1,30 @@ +/******************************************************************************* + + Copyright(C) Jonas 'Sortie' Termansen 2014. + + This file is part of the Sortix C Library. + + The Sortix C Library is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or (at your + option) any later version. + + The Sortix C Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the Sortix C Library. If not, see . + + wchar/mbsinit.cpp + Determine conversion object status. + +*******************************************************************************/ + +#include + +extern "C" int mbsinit(const mbstate_t* ps) +{ + return !ps || !ps->count; +} diff --git a/libc/wchar/mbsnrtowcs.cpp b/libc/wchar/mbsnrtowcs.cpp new file mode 100644 index 00000000..53c919e3 --- /dev/null +++ b/libc/wchar/mbsnrtowcs.cpp @@ -0,0 +1,82 @@ +/******************************************************************************* + + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. + + This file is part of the Sortix C Library. + + The Sortix C Library is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or (at your + option) any later version. 
+ + The Sortix C Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the Sortix C Library. If not, see . + + wchar/mbsnrtowcs.cpp + Convert a multibyte string to a wide-character string. + +*******************************************************************************/ + +#include +#include + +extern "C" +size_t mbsnrtowcs(wchar_t* restrict dst, + const char** restrict src_ptr, + size_t src_len, + size_t dst_len, + mbstate_t* restrict ps) +{ + static mbstate_t static_ps; + if ( !ps ) + ps = &static_ps; + + assert(src_ptr && *src_ptr); + const char* src = *src_ptr; + + // Continue to decode wide characters until we have filled the destination + // buffer or if we have exhausted the limit on input multibyte characters. + size_t dst_offset = 0; + size_t src_offset = 0; + while ( (!dst || dst_offset < dst_len) && src_offset < src_len ) + { + mbstate_t ps_copy = *ps; + wchar_t wc; + size_t amount = mbrtowc(&wc, src + src_offset, src_len - src_offset, ps); + + // Stop in the event a decoding error occured. + if ( amount == (size_t) -1 ) + return *src_ptr = src + src_offset, (size_t) -1; + + // Stop decoding early in the event we encountered a partial character. + if ( amount == (size_t) -2 ) + { + *ps = ps_copy; + break; + } + + // Store the decoded wide character in the destination buffer. + if ( dst ) + dst[dst_offset] = wc; + + // Stop decoding after decoding a null character and return a NULL + // source pointer to the caller, not including the null character in the + // number of characters stored in the destination buffer. 
+ if ( wc == L'\0' ) + { + src = NULL; + src_offset = 0; + break; + } + + dst_offset++; + src_offset += amount; + } + + return *src_ptr = src + src_offset, dst_offset; +} diff --git a/libc/wchar/mbsrtowcs.cpp b/libc/wchar/mbsrtowcs.cpp index 0808ea89..0289d266 100644 --- a/libc/wchar/mbsrtowcs.cpp +++ b/libc/wchar/mbsrtowcs.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2013. + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. This file is part of the Sortix C Library. @@ -24,49 +24,16 @@ #include #include +#include #include #include #include -extern "C" size_t mbsrtowcs(wchar_t* dst, const char** src_ptr, size_t dst_len, - mbstate_t* ps) +extern "C" +size_t mbsrtowcs(wchar_t* restrict dst, + const char** restrict src_ptr, + size_t dst_len, + mbstate_t* restrict ps) { - assert(src_ptr && *src_ptr); - // Avoid changing *src_ptr if dst is NULL. - const char* local_src_ptr = *src_ptr; - if ( !dst ) - src_ptr = &local_src_ptr; - // For some reason, the standards don't mandate that the secret ps variable - // is reset when ps is NULL, unlike mbstowcs that always resets this - // variable. We'll avoid resetting the variable here in case any programs - // actually take advantage of this fact. 
- static mbstate_t static_ps; - if ( !ps ) - ps = &static_ps; - size_t ret = 0; - size_t src_len = strlen(*src_ptr); - while ( !dst || dst_len ) - { - mbstate_t saved_ps = *ps; - size_t consumed = mbrtowc(dst, *src_ptr, src_len, ps); - if ( consumed == (size_t) 0 ) - { - *src_ptr = NULL; - break; - } - if ( consumed == (size_t) -1 ) - return (size_t) -1; - if ( consumed == (size_t) -2 ) - { - *ps = saved_ps; - break; - } - *src_ptr += consumed; - src_len -= consumed; - if ( dst ) - dst++, - dst_len--; - ret++; - } - return ret; + return mbsnrtowcs(dst, src_ptr, SIZE_MAX, dst_len, ps); } diff --git a/libc/wchar/wcrtomb.cpp b/libc/wchar/wcrtomb.cpp index ae620d05..32bea61b 100644 --- a/libc/wchar/wcrtomb.cpp +++ b/libc/wchar/wcrtomb.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2012. + Copyright(C) Jonas 'Sortie' Termansen 2012, 2014. This file is part of the Sortix C Library. @@ -23,58 +23,87 @@ *******************************************************************************/ #include -#include +#include #include -extern "C" -size_t wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict /*ps*/) +static +size_t utf8_wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict /*ps*/) { - if ( !wc ) + // The definition of UTF-8 prohibits encoding character numbers between + // U+D800 and U+DFFF, which are reserved for use with the UTF-16 encoding + // form (as surrogate pairs) and do not directly represent characters. + if ( 0xD800 <= wc && wc <= 0xDFFF ) + return errno = EILSEQ, (size_t) -1; + + // RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF. 
+ if ( 0x10FFFF < wc ) + return errno = EILSEQ, (size_t) -1; + + size_t index = 0; + + if ( wc < (1 << (7)) ) /* 0xxxxxxx */ { - if ( s ) - *s = '\0'; - return 1; + s[index++] = 0b00000000 | (wc >> 0 & 0b01111111); + return index; } - uint32_t unicode = wc; - uint8_t* buf = (uint8_t*) s; - unsigned bytes = 1; - unsigned bits = 7; - if ( (1U<<7U) <= unicode ) { bytes = 2; bits = 11; } - if ( (1U<<11U) <= unicode ) { bytes = 3; bits = 16; } - if ( (1U<<16U) <= unicode ) { bytes = 4; bits = 21; } - if ( (1U<<21U) <= unicode ) { bytes = 5; bits = 26; } - if ( (1U<<26U) <= unicode ) { bytes = 6; bits = 31; } - if ( (1U<<31U) <= unicode ) { errno = EILSEQ; return (size_t) -1; } - - if ( !s ) - return bytes; - - uint8_t prefix; - unsigned prefixavai; - switch ( bytes ) + if ( wc < (1 << (5 + 1 * 6)) ) /* 110xxxxx 10xxxxxx^1 */ { - case 1: prefixavai = 7; prefix = 0b0U << prefixavai; break; - case 2: prefixavai = 5; prefix = 0b110U << prefixavai; break; - case 3: prefixavai = 4; prefix = 0b1110U << prefixavai; break; - case 4: prefixavai = 3; prefix = 0b11110U << prefixavai; break; - case 5: prefixavai = 2; prefix = 0b111110U << prefixavai; break; - case 6: prefixavai = 1; prefix = 0b1111110U << prefixavai; break; - default: __builtin_unreachable(); + s[index++] = 0b11000000 | (wc >> 6 & 0b00011111); + s[index++] = 0b10000000 | (wc >> 0 & 0b00111111); + return index; } - // Put the first bits in the unused area of the prefix. 
- prefix |= unicode >> (bits - prefixavai); - *buf++ = prefix; - unsigned bitsleft = bits - prefixavai; - - while ( bitsleft ) + if ( wc < (1 << (4 + 2 * 6)) ) /* 1110xxxx 10xxxxxx^2 */ { - bitsleft -= 6; - uint8_t elembits = (unicode>>bitsleft) & ((1U<<6U)-1U); - uint8_t elem = (0b10U<<6U) | elembits; - *buf++ = elem; + s[index++] = 0b11100000 | (wc >> 2*6 & 0b00001111); + s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111); + return index; } - return bytes; + if ( wc < (1 << (3 + 3 * 6)) ) /* 11110xxx 10xxxxxx^3 */ + { + s[index++] = 0b11110000 | (wc >> 3*6 & 0b00000111); + s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111); + return index; + } + +#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */ + if ( wc < (1 << (2 + 4 * 6)) ) /* 111110xx 10xxxxxx^4 */ + { + s[index++] = 0b11111000 | (wc >> 4*6 & 0b00000011); + s[index++] = 0b10000000 | (wc >> 3*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111); + return index; + } + + if ( wc < (1 << (1 + 5 * 6)) ) /* 1111110x 10xxxxxx^5 */ + { + s[index++] = 0b11111100 | (wc >> 5*6 & 0b00000001); + s[index++] = 0b10000000 | (wc >> 4*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 3*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111); + s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111); + return index; + } +#endif + + return errno = EILSEQ, (size_t) -1; +} + +extern "C" +size_t wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict ps) +{ + char internal_buffer[MB_CUR_MAX]; + if ( !s ) + wc = L'\0', s = internal_buffer; + + // TODO: Verify whether the current locale is UTF-8. 
+ return utf8_wcrtomb(s, wc, ps); } diff --git a/libc/wchar/wcsnrtombs.cpp b/libc/wchar/wcsnrtombs.cpp new file mode 100644 index 00000000..0e634955 --- /dev/null +++ b/libc/wchar/wcsnrtombs.cpp @@ -0,0 +1,87 @@ +/******************************************************************************* + + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. + + This file is part of the Sortix C Library. + + The Sortix C Library is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or (at your + option) any later version. + + The Sortix C Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the Sortix C Library. If not, see . + + wchar/wcsnrtombs.cpp + Convert a wide-character string to multibyte string. + +*******************************************************************************/ + +#include +#include +#include +#include +#include + +extern "C" +size_t wcsnrtombs(char* restrict dst, + const wchar_t** restrict src_ptr, + size_t src_len, + size_t dst_len, + mbstate_t* restrict ps) +{ + static mbstate_t static_ps; + if ( !ps ) + ps = &static_ps; + + assert(src_ptr && *src_ptr); + const wchar_t* src = *src_ptr; + + // Continue to encode multibyte characters until we have filled the + // destination buffer or if we have exhausted the limit on input wide chars. + size_t dst_offset = 0; + size_t src_offset = 0; + while ( (!dst || dst_offset < dst_len) && src_offset < src_len ) + { + mbstate_t ps_copy = *ps; + wchar_t wc = src[src_offset]; + char mb[MB_CUR_MAX]; + size_t amount = wcrtomb(mb, wc, ps); + + // Stop in the event a decoding error occured. 
+ if ( amount == (size_t) -1 ) + return *src_ptr = src + src_offset, (size_t) -1; + + // Stop decoding early in the event we encountered a partial character, + // or that we ran out of space in the destination buffer. + if ( amount == (size_t) -2 || (dst && dst_len - dst_offset < amount ) ) + { + *ps = ps_copy; + break; + } + + // Store the decoded multibyte character in the destination buffer. + if ( dst ) + memcpy(dst + dst_offset, mb, amount); + + // Stop decoding after decoding a null character and return a NULL + // source pointer to the caller, not including the null character in the + // number of characters stored in the destination buffer. + if ( wc == L'\0' ) + { + src = NULL; + src_offset = 0; + break; + } + + dst_offset += amount; + src_offset++; + } + + return *src_ptr = src + src_offset, dst_offset; +} diff --git a/libc/wchar/wcsrtombs.cpp b/libc/wchar/wcsrtombs.cpp index a616e30f..09c7067d 100644 --- a/libc/wchar/wcsrtombs.cpp +++ b/libc/wchar/wcsrtombs.cpp @@ -1,6 +1,6 @@ /******************************************************************************* - Copyright(C) Jonas 'Sortie' Termansen 2013. + Copyright(C) Jonas 'Sortie' Termansen 2013, 2014. This file is part of the Sortix C Library. @@ -22,55 +22,14 @@ *******************************************************************************/ -#include -#include -#include -#include +#include #include -extern "C" size_t wcsrtombs(char* dst, const wchar_t** src_ptr, size_t dst_len, - mbstate_t* ps) +extern "C" +size_t wcsrtombs(char* restrict dst, + const wchar_t** restrict src_ptr, + size_t dst_len, + mbstate_t* ps) { - assert(src_ptr && *src_ptr); - // Avoid changing *src_ptr if dst is NULL. - const wchar_t* local_src_ptr = *src_ptr; - if ( !dst ) - src_ptr = &local_src_ptr; - // For some reason, the standards don't mandate that the secret ps variable - // is reset when ps is NULL, unlike mbstowcs that always resets this - // variable. 
We'll avoid resetting the variable here in case any programs - // actually take advantage of this fact. - static mbstate_t static_ps; - if ( !ps ) - ps = &static_ps; - size_t ret = 0; - size_t src_len = wcslen(*src_ptr); - char buf[MB_CUR_MAX]; - while ( !dst || dst_len ) - { - mbstate_t saved_ps = *ps; - size_t produced = wcrtomb(buf, **src_ptr, ps); - if ( produced == (size_t) -1 ) - return (size_t) -1; - if ( dst && dst_len < produced ) - { - *ps = saved_ps; - break; - } - memcpy(dst, buf, produced); - if ( **src_ptr == L'\0' ) - { - ret += produced - 1; // Don't count the '\0' byte. - *src_ptr = NULL; - break; - } - ret += produced; - (*src_ptr)++; - src_len--; - if ( dst ) - dst += produced, - dst_len -= produced; - ret++; - } - return ret; + return wcsnrtombs(dst, src_ptr, SIZE_MAX, dst_len, ps); }