From f41964fcab0d34c510aa0e6a2a4471b6770937b3 Mon Sep 17 00:00:00 2001
From: Jonas 'Sortie' Termansen <sortie@maxsi.org>
Date: Thu, 17 Apr 2014 17:41:37 +0200
Subject: [PATCH] Reimplement wchar conversion API.

---
 libc/Makefile             |   3 +
 libc/include/stdlib.h     |   5 +-
 libc/include/wchar.h      |  22 ++---
 libc/stdlib/mblen.cpp     |  18 ++--
 libc/stdlib/mbstowcs.cpp  |  18 ++--
 libc/stdlib/mbtowc.cpp    |  16 +++-
 libc/stdlib/wcstombs.cpp  |  15 ++--
 libc/stdlib/wctomb.cpp    |  13 ++-
 libc/wchar/mbrlen.cpp     |  59 +------------
 libc/wchar/mbrtowc.cpp    | 177 +++++++++++++++++++++++---------------
 libc/wchar/mbsinit.cpp    |  30 +++++++
 libc/wchar/mbsnrtowcs.cpp |  82 ++++++++++++++++++
 libc/wchar/mbsrtowcs.cpp  |  49 ++---------
 libc/wchar/wcrtomb.cpp    | 115 ++++++++++++++++---------
 libc/wchar/wcsnrtombs.cpp |  87 +++++++++++++++++++
 libc/wchar/wcsrtombs.cpp  |  57 ++----------
 16 files changed, 463 insertions(+), 303 deletions(-)
 create mode 100644 libc/wchar/mbsinit.cpp
 create mode 100644 libc/wchar/mbsnrtowcs.cpp
 create mode 100644 libc/wchar/wcsnrtombs.cpp

diff --git a/libc/Makefile b/libc/Makefile
index c8b27f9c..82557455 100644
--- a/libc/Makefile
+++ b/libc/Makefile
@@ -238,6 +238,8 @@ time/strftime.o \
 time/timegm.o \
 wchar/mbrlen.o \
 wchar/mbrtowc.o \
+wchar/mbsinit.o \
+wchar/mbsnrtowcs.o \
 wchar/mbsrtowcs.o \
 wchar/wcrtomb.o \
 wchar/wcscat.o \
@@ -251,6 +253,7 @@ wchar/wcslen.o \
 wchar/wcsncat.o \
 wchar/wcsncmp.o \
 wchar/wcsncpy.o \
+wchar/wcsnrtombs.o \
 wchar/wcspbrk.o \
 wchar/wcsrchr.o \
 wchar/wcsrtombs.o \
diff --git a/libc/include/stdlib.h b/libc/include/stdlib.h
index f2d38f3a..f138332c 100644
--- a/libc/include/stdlib.h
+++ b/libc/include/stdlib.h
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013.
+    Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -39,8 +39,7 @@ __BEGIN_DECLS
 /* TODO: This random interface is stupid. What should a good value be? */
 #define RAND_MAX 32767
 
-/* TODO: This is just a value. It's not a compile time constant! */
-#define MB_CUR_MAX 16
+#define MB_CUR_MAX 6
 
 typedef struct
 {
diff --git a/libc/include/wchar.h b/libc/include/wchar.h
index b092a4e8..0f7f7fb9 100644
--- a/libc/include/wchar.h
+++ b/libc/include/wchar.h
@@ -105,12 +105,15 @@ typedef __wint_t wint_t;
 /* Conversion state information. */
 typedef struct
 {
-	int __count;
-	union
-	{
-		wint_t __wch;
-		char __wchb[4];
-	} __value;		/* Value so far. */
+#if defined(__is_sortix_libc)
+	unsigned short count;
+	unsigned short length;
+	wint_t wch;
+#else
+	unsigned short __count;
+	unsigned short __length;
+	wint_t __wch;
+#endif
 } mbstate_t;
 #define __mbstate_t_defined 1
 #endif
@@ -126,12 +129,11 @@ struct tm;
 /* TODO: wint_t getwchar(void); */
 size_t mbrlen(const char* __restrict, size_t, mbstate_t* __restrict);
 size_t mbrtowc(wchar_t* __restrict, const char* __restrict, size_t, mbstate_t* __restrict);
-/* TODO: int mbsinit(const mbstate_t*); */
+int mbsinit(const mbstate_t*);
 size_t mbsrtowcs(wchar_t* __restrict, const char** __restrict, size_t, mbstate_t* __restrict);
 /* TODO: wint_t putwc(wchar_t, FILE*); */
 /* TODO: wint_t putwchar(wchar_t); */
 /* TODO: wint_t ungetwc(wint_t, FILE*); */
-
 size_t wcrtomb(char* __restrict, wchar_t, mbstate_t* __restrict);
 wchar_t* wcscat(wchar_t* __restrict, const wchar_t* __restrict);
 wchar_t* wcschr(const wchar_t*, wchar_t);
@@ -193,7 +195,7 @@ int wcwidth(wchar_t);
 
 /* Functions from POSIX 2008. */
 #if __USE_SORTIX || 200809L <= __USE_POSIX
-/* TODO: size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict); */
+size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict);
 /* TODO: FILE* open_wmemstream(wchar_t**, size_t*); */
 /* TODO: wchar_t* wcpcpy(wchar_t* __restrict, const wchar_t* __restrict); */
 /* TODO: wchar_t* wcpncpy(wchar_t* __restrict, const wchar_t* __restrict, size_t); */
@@ -204,7 +206,7 @@ int wcwidth(wchar_t);
 /* TODO: int wcsncasecmp(const wchar_t*, const wchar_t *, size_t); */
 /* TODO: int wcsncasecmp_l(const wchar_t*, const wchar_t *, size_t, locale_t); */
 /* TODO: size_t wcsnlen(const wchar_t*, size_t); */
-/* TODO: size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict); */
+size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict);
 /* TODO: size_t wcsxfrm_l(wchar_t* __restrict, const wchar_t* __restrict, size_t, locale_t); */
 #endif
 
diff --git a/libc/stdlib/mblen.cpp b/libc/stdlib/mblen.cpp
index c4378cbd..ac88ab2d 100644
--- a/libc/stdlib/mblen.cpp
+++ b/libc/stdlib/mblen.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2013.
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -26,18 +26,18 @@
 #include <string.h>
 #include <wchar.h>
 
+// TODO: This function is unpure and should be removed.
 extern "C" int mblen(const char* s, size_t n)
 {
+	wchar_t wc;
 	static mbstate_t ps;
+	size_t result = mbrtowc(&wc, s, n, &ps);
 	if ( !s )
-	{
 		memset(&ps, 0, sizeof(ps));
-		return 0; // TODO: Give the correct return value depending on ps.
-	}
-	size_t ret = mbrlen(s, n, &ps);
-	if ( ret == (size_t) -2 )
+	if ( result == (size_t) -1 )
+		return memset(&ps, 0, sizeof(ps)), -1;
+	// TODO: Should ps be cleared to zero in this case?
+	if ( result == (size_t) -2 )
 		return -1;
-	if ( ret == (size_t) -1 )
-		return -1;
-	return (int) ret;
+	return (int) result;
 }
diff --git a/libc/stdlib/mbstowcs.cpp b/libc/stdlib/mbstowcs.cpp
index b899f63c..4b2dbfca 100644
--- a/libc/stdlib/mbstowcs.cpp
+++ b/libc/stdlib/mbstowcs.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2013.
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -23,16 +23,14 @@
 *******************************************************************************/
 
 #include <stdlib.h>
+#include <string.h>
 #include <wchar.h>
 
-extern "C" size_t mbstowcs(wchar_t* dst, const char* src, size_t n)
+// TODO: This function is unpure and should be removed.
+extern "C"
+size_t mbstowcs(wchar_t* restrict dst, const char* restrict src, size_t n)
 {
-	// Reset the secret conversion state variable in mbsrtowcs that is used when
-	// ps is NULL by successfully converting the empty string. As always, this
-	// is not multithread secure. For some reason, the standards don't mandate
-	// that the conversion state is reset when mbsrtowcs is called with ps=NULL,
-	// which arguably is a feature - but this function is supposed to do it.
-	const char* empty_string = "";
-	mbsrtowcs(NULL, &empty_string, 0, NULL);
-	return mbsrtowcs(dst, &src, n, NULL);
+	mbstate_t ps;
+	memset(&ps, 0, sizeof(ps));
+	return mbsrtowcs(dst, (const char**) &src, n, &ps);
 }
diff --git a/libc/stdlib/mbtowc.cpp b/libc/stdlib/mbtowc.cpp
index fffdee4f..fc410a1d 100644
--- a/libc/stdlib/mbtowc.cpp
+++ b/libc/stdlib/mbtowc.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2011, 2012.
+    Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -24,10 +24,20 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <wchar.h>
 
 // TODO: This function is unpure and should be removed.
-extern "C" int mbtowc(wchar_t* pwd, const char* s, size_t n)
+extern "C" int mbtowc(wchar_t* pwc, const char* s, size_t n)
 {
-	return mbrtowc(pwd, s, n, NULL);
+	static mbstate_t ps;
+	size_t result = mbrtowc(pwc, s, n, &ps);
+	if ( !s )
+		memset(&ps, 0, sizeof(ps));
+	if ( result == (size_t) -1 )
+		return memset(&ps, 0, sizeof(ps)), -1;
+	// TODO: Should ps be cleared to zero in this case?
+	if ( result == (size_t) -2 )
+		return -1;
+	return (int) result;
 }
diff --git a/libc/stdlib/wcstombs.cpp b/libc/stdlib/wcstombs.cpp
index d305ff0a..c6cf6be1 100644
--- a/libc/stdlib/wcstombs.cpp
+++ b/libc/stdlib/wcstombs.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2013.
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -23,16 +23,13 @@
 *******************************************************************************/
 
 #include <stdlib.h>
+#include <string.h>
 #include <wchar.h>
 
+// TODO: This function is unpure and should be removed.
 extern "C" size_t wcstombs(char* dst, const wchar_t* src, size_t n)
 {
-	// Reset the secret conversion state variable in wcsrtombs that is used when
-	// ps is NULL by successfully converting the empty string. As always, this
-	// is not multithread secure. For some reason, the standards don't mandate
-	// that the conversion state is reset when wcsrtombs is called with ps=NULL,
-	// which arguably is a feature - but this function is supposed to do it.
-	const wchar_t* empty_string = L"";
-	wcsrtombs(NULL, &empty_string, 0, NULL);
-	return wcsrtombs(dst, &src, n, NULL);
+	mbstate_t ps;
+	memset(&ps, 0, sizeof(ps));
+	return wcsrtombs(dst, &src, n, &ps);
 }
diff --git a/libc/stdlib/wctomb.cpp b/libc/stdlib/wctomb.cpp
index ecbf22df..a2bb5c70 100644
--- a/libc/stdlib/wctomb.cpp
+++ b/libc/stdlib/wctomb.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2012.
+    Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -23,10 +23,19 @@
 *******************************************************************************/
 
 #include <stdlib.h>
+#include <string.h>
 #include <wchar.h>
 
 // TODO: This function is unpure and should be removed.
 extern "C" int wctomb(char* s, wchar_t wc)
 {
-	return wcrtomb(s, wc, NULL);
+	static mbstate_t ps;
+	size_t result = wcrtomb(s, wc, &ps);
+	if ( !s )
+		memset(&ps, 0, sizeof(ps));
+	if ( result == (size_t) -1 )
+		return -1;
+	if ( result == (size_t) -2 )
+		return -1;
+	return (int) result;
 }
diff --git a/libc/wchar/mbrlen.cpp b/libc/wchar/mbrlen.cpp
index d9417009..2589f837 100644
--- a/libc/wchar/mbrlen.cpp
+++ b/libc/wchar/mbrlen.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2013.
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -22,64 +22,11 @@
 
 *******************************************************************************/
 
-#include <errno.h>
-#include <string.h>
 #include <wchar.h>
 
-static size_t utf8_header_length(unsigned char uc)
-{
-	if ( (uc & 0b11000000) == 0b10000000 )
-		return 0;
-	if ( (uc & 0b10000000) == 0b00000000 )
-		return 1;
-	if ( (uc & 0b11100000) == 0b11000000 )
-		return 2;
-	if ( (uc & 0b11110000) == 0b11100000 )
-		return 3;
-	if ( (uc & 0b11111000) == 0b11110000 )
-		return 4;
-	if ( (uc & 0b11111100) == 0b11111000 )
-		return 5;
-	if ( (uc & 0b11111110) == 0b11111100 )
-		return 6;
-	return (size_t) -1;
-}
-
-// TODO: Use the shift state.
 extern "C"
 size_t mbrlen(const char* restrict s, size_t n, mbstate_t* restrict ps)
 {
-	size_t expected_length;
-
-	for ( size_t i = 0; i < n; i++ )
-	{
-		unsigned char uc = (unsigned char) s[i];
-
-		if ( i == 0 )
-		{
-			if ( !uc )
-			{
-				memset(ps, 0, sizeof(*ps));
-				return 0;
-			}
-
-			if ( (expected_length = utf8_header_length(uc)) == (size_t) -1 )
-				return errno = EILSEQ, (size_t) -1;
-
-			// Check if we encounted an unexpected character claiming to be in
-			// the middle of a UTF-8 multibyte sequence (10xxxxxx).
-			if ( expected_length == 0 )
-				// TODO: Should we play catch up with the partial sequence?
-				return errno = EILSEQ, (size_t) -1;
-		}
-
-		// All non-header bytes should be of the form 10xxxxxx.
-		if ( 0 < i && expected_length < n && (uc & 0b11000000) != 0b10000000 )
-			return errno = EILSEQ, (size_t) -1;
-
-		if ( i + 1 == expected_length )
-			return i + 1;
-	}
-
-	return (size_t) -2;
+	static mbstate_t static_ps;
+	return mbrtowc(NULL, s, n, ps ? ps : &static_ps);
 }
diff --git a/libc/wchar/mbrtowc.cpp b/libc/wchar/mbrtowc.cpp
index 2c82644b..ac663178 100644
--- a/libc/wchar/mbrtowc.cpp
+++ b/libc/wchar/mbrtowc.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2012.
+    Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -24,82 +24,123 @@
 
 #include <errno.h>
 #include <stdint.h>
+#include <string.h>
 #include <wchar.h>
 
-extern "C"
-size_t mbrtowc(wchar_t* restrict pwc, const char* restrict s, size_t n,
-               mbstate_t* restrict /*ps*/)
+static
+size_t utf8_mbrtowc(wchar_t* restrict pwc,
+                    const char* restrict s,
+                    size_t n,
+                    mbstate_t* restrict ps)
 {
-	if ( !s )
+	size_t i;
+	for ( i = 0; !(i && ps->count == 0); i++ )
 	{
-		// TODO: Restore ps to initial state if currently valid.
-		return 0;
-	}
-	uint8_t* buf = (uint8_t*) s;
-	wchar_t ret = 0;
-	size_t numbytes = 0;
-	size_t sequence_len = 1;
-	while ( numbytes < sequence_len )
-	{
-		if ( numbytes == n )
-		{
-			// TODO: Support restore through the mbstate_t!
+		// Handle the case where we were not able to fully decode a character,
+		// but it is still possible to finish decoding given more bytes.
+		if ( n <= i )
 			return (size_t) -2;
+
+		char c = s[i];
+		unsigned char uc = (unsigned char) c;
+
+		// The initial state is that we expect a leading byte that informs us of
+		// the length of this character sequence. The number of consecutive high
+		// order bits tells us how many bytes make up this character (one
+		// leading byte followed by zero or more continuation bytes).
+		if ( ps->count == 0 )
+		{
+			if ( (uc & 0b10000000) == 0b00000000 ) /* 0xxxxxxx */
+			{
+				ps->length = (ps->count = 0) + 1;
+				ps->wch = (wchar_t) uc & 0b1111111;
+			}
+			else if ( (uc & 0b11100000) == 0b11000000 ) /* 110xxxxx */
+			{
+				ps->length = (ps->count = 1) + 1;
+				ps->wch = (wchar_t) uc & 0b11111;
+			}
+			else if ( (uc & 0b11110000) == 0b11100000 ) /* 1110xxxx */
+			{
+				ps->length = (ps->count = 2) + 1;
+				ps->wch = (wchar_t) uc & 0b1111;
+			}
+			else if ( (uc & 0b11111000) == 0b11110000 ) /* 11110xxx */
+			{
+				ps->length = (ps->count = 3) + 1;
+				ps->wch = (wchar_t) uc & 0b111;
+			}
+#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
+			else if ( (uc & 0b11111100) == 0b11111000 ) /* 111110xx */
+			{
+				ps->length = (ps->count = 4) + 1;
+				ps->wch = (wchar_t) uc & 0b11;
+			}
+			else if ( (uc & 0b11111110) == 0b11111100 ) /* 1111110x */
+			{
+				ps->length = (ps->count = 5) + 1;
+				ps->wch = (wchar_t) uc & 0b1;
+			}
+#endif
+			else
+				return errno = EILSEQ, (size_t) -1;
 		}
-		uint8_t b = buf[numbytes++];
 
-		bool is_continuation = b >> (8-2) == 0b10;
-		if ( 1 == numbytes && is_continuation )
-			return errno = EILSEQ, (size_t) -1;
-		if ( 2 <= numbytes && !is_continuation )
-			return errno = EILSEQ, (size_t) -1;
-
-		wchar_t new_bits;
-		size_t new_bits_num;
-		if ( b >> (8-1) == 0b0 )
-			new_bits = b & 0b01111111,
-			new_bits_num = 7,
-			sequence_len = 1;
-		else if ( b >> (8-2) == 0b10 )
-			new_bits = b & 0b00111111,
-			new_bits_num = 6,
-			sequence_len = 2;
-		else if ( b >> (8-3) == 0b110 )
-			new_bits = b & 0b00011111,
-			new_bits_num = 5,
-			sequence_len = 3;
-		else if ( b >> (8-4) == 0b1110 )
-			new_bits = b & 0b00001111,
-			new_bits_num = 4,
-			sequence_len = 4;
-		else if ( b >> (8-5) == 0b11110 )
-			new_bits = b & 0b00000111,
-			new_bits_num = 3,
-			sequence_len = 5;
-		else if ( b >> (8-6) == 0b111110 )
-			new_bits = b & 0b00000011,
-			new_bits_num = 2,
-			sequence_len = 6;
-		else if ( b >> (8-7) == 0b1111110 )
-			new_bits = b & 0b00000001,
-			new_bits_num = 1,
-			sequence_len = 7;
+		// The secondary state is that following a leading byte, we are
+		// expecting a non-zero number of continuation bytes.
 		else
-			return errno = EILSEQ, (size_t) -1;
-		ret = ret >> new_bits_num | new_bits;
+		{
+			// Verify this is a continuation byte.
+			if ( (uc & 0b11000000) != 0b10000000 )
+				return errno = EILSEQ, (size_t) -1;
+			ps->wch = ps->wch << 6 | (uc & 0b00111111);
+			ps->count--;
+		}
 	}
-	if ( !ret )
-	{
-		// TODO: Reset ps to initial state.
-		return 0;
-	}
-	if ( (numbytes == 2 && ret <= 0x007F) ||
-	     (numbytes == 3 && ret <= 0x07FF) ||
-	     (numbytes == 4 && ret <= 0xFFFF) ||
-	     (numbytes == 5 && ret <= 0x1FFFFF) ||
-	     (numbytes == 6 && ret <= 0x3FFFFFF) )
+
+	// Reject overlong encodings: a sequence of N bytes is only valid if its
+	// value is too large to have been encoded using N - 1 bytes.
+	if ( ps->length == 2 && ps->wch < (1 << 7) )
 		return errno = EILSEQ, (size_t) -1;
+	if ( ps->length == 3 && ps->wch < (1 << (5 + 1 * 6)) )
+		return errno = EILSEQ, (size_t) -1;
+	if ( ps->length == 4 && ps->wch < (1 << (4 + 2 * 6)) )
+		return errno = EILSEQ, (size_t) -1;
+#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
+	if ( ps->length == 5 && ps->wch < (1 << (3 + 3 * 6)) )
+		return errno = EILSEQ, (size_t) -1;
+	if ( ps->length == 6 && ps->wch < (1 << (2 + 4 * 6)) )
+		return errno = EILSEQ, (size_t) -1;
+#endif
+
+	// RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF inclusive and prohibits
+	// the UTF-16 surrogates 0xD800 through 0xDFFF.
+	if ( 0x10FFFF < ps->wch || (0xD800 <= ps->wch && ps->wch <= 0xDFFF) )
+		return errno = EILSEQ, (size_t) -1;
+
+	wchar_t result = ps->wch;
+
 	if ( pwc )
-		*pwc = ret;
-	return numbytes;
+		*pwc = result;
+
+	ps->length = 0;
+	ps->wch = 0;
+
+	return result != L'\0' ? i : 0;
+}
+
+extern "C"
+size_t mbrtowc(wchar_t* restrict pwc,
+               const char* restrict s,
+               size_t n,
+               mbstate_t* restrict ps)
+{
+	static mbstate_t static_ps; // Used when the caller supplies no state.
+	if ( !ps )
+		ps = &static_ps;
+	// POSIX: a NULL s is equivalent to mbrtowc(NULL, "", 1, ps), ignoring pwc.
+	if ( !s )
+		pwc = NULL, s = "", n = 1;
+	// TODO: Verify whether the current locale is UTF-8.
+	return utf8_mbrtowc(pwc, s, n, ps);
+}
diff --git a/libc/wchar/mbsinit.cpp b/libc/wchar/mbsinit.cpp
new file mode 100644
index 00000000..1bc8ac51
--- /dev/null
+++ b/libc/wchar/mbsinit.cpp
@@ -0,0 +1,30 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2014.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    wchar/mbsinit.cpp
+    Determine conversion object status.
+
+*******************************************************************************/
+
+#include <wchar.h>
+
+extern "C" int mbsinit(const mbstate_t* ps)
+{
+	return ps == NULL || ps->count == 0; // NULL counts as the initial state.
+}
diff --git a/libc/wchar/mbsnrtowcs.cpp b/libc/wchar/mbsnrtowcs.cpp
new file mode 100644
index 00000000..53c919e3
--- /dev/null
+++ b/libc/wchar/mbsnrtowcs.cpp
@@ -0,0 +1,82 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    wchar/mbsnrtowcs.cpp
+    Convert a multibyte string to a wide-character string.
+
+*******************************************************************************/
+
+#include <assert.h>
+#include <wchar.h>
+
+// Decode up to src_len bytes at *src_ptr into at most dst_len wide characters
+// at dst. If dst is NULL, only count them and leave *src_ptr untouched.
+extern "C"
+size_t mbsnrtowcs(wchar_t* restrict dst,
+                  const char** restrict src_ptr,
+                  size_t src_len,
+                  size_t dst_len,
+                  mbstate_t* restrict ps)
+{
+	static mbstate_t static_ps;
+	if ( !ps )
+		ps = &static_ps;
+	assert(src_ptr && *src_ptr);
+	const char* src = *src_ptr;
+	const char* unused_src; // Receives the pointer updates if dst is NULL.
+	src_ptr = dst ? src_ptr : &unused_src;
+
+	// Decode until dst is full or src_len input bytes were consumed.
+	size_t dst_offset = 0;
+	size_t src_offset = 0;
+	while ( (!dst || dst_offset < dst_len) && src_offset < src_len )
+	{
+		mbstate_t ps_copy = *ps;
+		wchar_t wc;
+		size_t amount = mbrtowc(&wc, src + src_offset, src_len - src_offset, ps);
+
+		// Stop in the event a decoding error occurred.
+		if ( amount == (size_t) -1 )
+			return *src_ptr = src + src_offset, (size_t) -1;
+
+		// Stop decoding early in the event we encountered a partial character.
+		if ( amount == (size_t) -2 )
+		{
+			*ps = ps_copy;
+			break;
+		}
+
+		// Store the decoded wide character in the destination buffer.
+		if ( dst )
+			dst[dst_offset] = wc;
+
+		// Stop after the null character and hand a NULL source pointer to the
+		// caller. The null character is not counted in the returned length.
+		if ( wc == L'\0' )
+		{
+			src = NULL, src_offset = 0;
+			break;
+		}
+
+		dst_offset++;
+		src_offset += amount;
+	}
+
+	return *src_ptr = src + src_offset, dst_offset;
+}
diff --git a/libc/wchar/mbsrtowcs.cpp b/libc/wchar/mbsrtowcs.cpp
index 0808ea89..0289d266 100644
--- a/libc/wchar/mbsrtowcs.cpp
+++ b/libc/wchar/mbsrtowcs.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2013.
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -24,49 +24,16 @@
 
 #include <assert.h>
 #include <errno.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <wchar.h>
 
-extern "C" size_t mbsrtowcs(wchar_t* dst, const char** src_ptr, size_t dst_len,
-                            mbstate_t* ps)
+extern "C"
+size_t mbsrtowcs(wchar_t* restrict dst,
+                 const char** restrict src_ptr,
+                 size_t dst_len,
+                 mbstate_t* restrict ps)
 {
-	assert(src_ptr && *src_ptr);
-	// Avoid changing *src_ptr if dst is NULL.
-	const char* local_src_ptr = *src_ptr;
-	if ( !dst )
-		src_ptr = &local_src_ptr;
-	// For some reason, the standards don't mandate that the secret ps variable
-	// is reset when ps is NULL, unlike mbstowcs that always resets this
-	// variable. We'll avoid resetting the variable here in case any programs
-	// actually take advantage of this fact.
-	static mbstate_t static_ps;
-	if ( !ps )
-		ps = &static_ps;
-	size_t ret = 0;
-	size_t src_len = strlen(*src_ptr);
-	while ( !dst || dst_len )
-	{
-		mbstate_t saved_ps = *ps;
-		size_t consumed = mbrtowc(dst, *src_ptr, src_len, ps);
-		if ( consumed == (size_t) 0 )
-		{
-			*src_ptr = NULL;
-			break;
-		}
-		if ( consumed == (size_t) -1 )
-			return (size_t) -1;
-		if ( consumed == (size_t) -2 )
-		{
-			*ps = saved_ps;
-			break;
-		}
-		*src_ptr += consumed;
-		src_len -= consumed;
-		if ( dst )
-			dst++,
-			dst_len--;
-		ret++;
-	}
-	return ret;
+	return mbsnrtowcs(dst, src_ptr, SIZE_MAX, dst_len, ps);
 }
diff --git a/libc/wchar/wcrtomb.cpp b/libc/wchar/wcrtomb.cpp
index ae620d05..32bea61b 100644
--- a/libc/wchar/wcrtomb.cpp
+++ b/libc/wchar/wcrtomb.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2012.
+    Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -23,58 +23,87 @@
 *******************************************************************************/
 
 #include <errno.h>
-#include <stdint.h>
+#include <stdlib.h>
 #include <wchar.h>
 
-extern "C"
-size_t wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict /*ps*/)
+static
+size_t utf8_wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict /*ps*/)
 {
-	if ( !wc )
+	// UTF-8 may not encode U+D800 through U+DFFF: they are reserved for UTF-16
+	// surrogate pairs and do not directly represent characters.
+	if ( 0xD800 <= wc && wc <= 0xDFFF )
+		return errno = EILSEQ, (size_t) -1;
+
+	// RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF inclusive. Comparing as
+	// unsigned also rejects negative values of a signed wchar_t.
+	if ( 0x10FFFF < (unsigned long) wc )
+		return errno = EILSEQ, (size_t) -1;
+
+	size_t index = 0;
+
+	if ( wc < (1 << (7)) )  /* 0xxxxxxx */
 	{
-		if ( s )
-			*s = '\0';
-		return 1;
+		s[index++] = 0b00000000 | (wc >> 0 & 0b01111111);
+		return index;
 	}
 
-	uint32_t unicode = wc;
-	uint8_t* buf = (uint8_t*) s;
-	unsigned bytes = 1;
-	unsigned bits = 7;
-	if ( (1U<<7U) <= unicode ) { bytes = 2; bits = 11; }
-	if ( (1U<<11U) <= unicode ) { bytes = 3; bits = 16; }
-	if ( (1U<<16U) <= unicode ) { bytes = 4; bits = 21; }
-	if ( (1U<<21U) <= unicode ) { bytes = 5; bits = 26; }
-	if ( (1U<<26U) <= unicode ) { bytes = 6; bits = 31; }
-	if ( (1U<<31U) <= unicode ) { errno = EILSEQ; return (size_t) -1; }
-
-	if ( !s )
-		return bytes;
-
-	uint8_t prefix;
-	unsigned prefixavai;
-	switch ( bytes )
+	if ( wc < (1 << (5 + 1 * 6)) )  /* 110xxxxx 10xxxxxx^1 */
 	{
-	case 1: prefixavai = 7; prefix = 0b0U << prefixavai; break;
-	case 2: prefixavai = 5; prefix = 0b110U << prefixavai; break;
-	case 3: prefixavai = 4; prefix = 0b1110U << prefixavai; break;
-	case 4: prefixavai = 3; prefix = 0b11110U << prefixavai; break;
-	case 5: prefixavai = 2; prefix = 0b111110U << prefixavai; break;
-	case 6: prefixavai = 1; prefix = 0b1111110U << prefixavai; break;
-	default: __builtin_unreachable();
+		s[index++] = 0b11000000 | (wc >> 6 & 0b00011111);
+		s[index++] = 0b10000000 | (wc >> 0 & 0b00111111);
+		return index;
 	}
 
-	// Put the first bits in the unused area of the prefix.
-	prefix |= unicode >> (bits - prefixavai);
-	*buf++ = prefix;
-	unsigned bitsleft = bits - prefixavai;
-
-	while ( bitsleft )
+	if ( wc < (1 << (4 + 2 * 6)) )  /* 1110xxxx 10xxxxxx^2 */
 	{
-		bitsleft -= 6;
-		uint8_t elembits = (unicode>>bitsleft) & ((1U<<6U)-1U);
-		uint8_t elem = (0b10U<<6U) | elembits;
-		*buf++ = elem;
+		s[index++] = 0b11100000 | (wc >> 2*6 & 0b00001111);
+		s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
+		return index;
 	}
 
-	return bytes;
+	if ( wc < (1 << (3 + 3 * 6)) )  /* 11110xxx 10xxxxxx^3 */
+	{
+		s[index++] = 0b11110000 | (wc >> 3*6 & 0b00000111);
+		s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
+		return index;
+	}
+
+#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
+	if ( wc < (1 << (2 + 4 * 6)) )  /* 111110xx 10xxxxxx^4 */
+	{
+		s[index++] = 0b11111000 | (wc >> 4*6 & 0b00000011);
+		s[index++] = 0b10000000 | (wc >> 3*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
+		return index;
+	}
+
+	if ( wc < (1 << (1 + 5 * 6)) )  /* 1111110x 10xxxxxx^5 */
+	{
+		s[index++] = 0b11111100 | (wc >> 5*6 & 0b00000001);
+		s[index++] = 0b10000000 | (wc >> 4*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 3*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
+		s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
+		return index;
+	}
+#endif
+	// No encoding matched: report the error as (size_t) -1 with errno set.
+	return errno = EILSEQ, (size_t) -1;
+}
+
+extern "C"
+size_t wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict ps)
+{
+	// A NULL s behaves as if a null wide character was encoded into a buffer.
+	char scratch[MB_CUR_MAX];
+	if ( s == NULL )
+		s = scratch, wc = L'\0';
+	// TODO: Verify whether the current locale is UTF-8.
+	return utf8_wcrtomb(s, wc, ps);
+}
diff --git a/libc/wchar/wcsnrtombs.cpp b/libc/wchar/wcsnrtombs.cpp
new file mode 100644
index 00000000..0e634955
--- /dev/null
+++ b/libc/wchar/wcsnrtombs.cpp
@@ -0,0 +1,87 @@
+/*******************************************************************************
+
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
+
+    This file is part of the Sortix C Library.
+
+    The Sortix C Library is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or (at your
+    option) any later version.
+
+    The Sortix C Library is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
+
+    wchar/wcsnrtombs.cpp
+    Convert a wide-character string to multibyte string.
+
+*******************************************************************************/
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wchar.h>
+
+// Encode up to src_len wide characters at *src_ptr into at most dst_len bytes
+// at dst. If dst is NULL, only count the bytes and leave *src_ptr untouched.
+extern "C"
+size_t wcsnrtombs(char* restrict dst,
+                  const wchar_t** restrict src_ptr,
+                  size_t src_len,
+                  size_t dst_len,
+                  mbstate_t* restrict ps)
+{
+	static mbstate_t static_ps;
+	if ( !ps )
+		ps = &static_ps;
+	assert(src_ptr && *src_ptr);
+	const wchar_t* src = *src_ptr;
+	const wchar_t* unused_src; // Receives the pointer updates if dst is NULL.
+	src_ptr = dst ? src_ptr : &unused_src;
+
+	// Encode until dst is full or src_len wide characters were consumed.
+	size_t dst_offset = 0;
+	size_t src_offset = 0;
+	while ( (!dst || dst_offset < dst_len) && src_offset < src_len )
+	{
+		mbstate_t ps_copy = *ps;
+		wchar_t wc = src[src_offset];
+		char mb[MB_CUR_MAX];
+		size_t amount = wcrtomb(mb, wc, ps);
+
+		// Stop in the event an encoding error occurred.
+		if ( amount == (size_t) -1 )
+			return *src_ptr = src + src_offset, (size_t) -1;
+
+		// Stop encoding early in the event we encountered a partial character,
+		// or the character doesn't fit in what remains of the destination.
+		if ( amount == (size_t) -2 || (dst && dst_len - dst_offset < amount) )
+		{
+			*ps = ps_copy;
+			break;
+		}
+
+		// Store the encoded multibyte character in the destination buffer.
+		if ( dst )
+			memcpy(dst + dst_offset, mb, amount);
+
+		// Stop after the null character and hand a NULL source pointer to the
+		// caller. The null byte is not counted in the returned length.
+		if ( wc == L'\0' )
+		{
+			src = NULL, src_offset = 0;
+			break;
+		}
+
+		dst_offset += amount;
+		src_offset++;
+	}
+
+	return *src_ptr = src + src_offset, dst_offset;
+}
diff --git a/libc/wchar/wcsrtombs.cpp b/libc/wchar/wcsrtombs.cpp
index a616e30f..09c7067d 100644
--- a/libc/wchar/wcsrtombs.cpp
+++ b/libc/wchar/wcsrtombs.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 
-    Copyright(C) Jonas 'Sortie' Termansen 2013.
+    Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
 
     This file is part of the Sortix C Library.
 
@@ -22,55 +22,14 @@
 
 *******************************************************************************/
 
-#include <assert.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <string.h>
+#include <stdint.h>
 #include <wchar.h>
 
-extern "C" size_t wcsrtombs(char* dst, const wchar_t** src_ptr, size_t dst_len,
-                            mbstate_t* ps)
+extern "C"
+size_t wcsrtombs(char* restrict dst,
+                 const wchar_t** restrict src_ptr,
+                 size_t dst_len,
+                 mbstate_t* ps)
 {
-	assert(src_ptr && *src_ptr);
-	// Avoid changing *src_ptr if dst is NULL.
-	const wchar_t* local_src_ptr = *src_ptr;
-	if ( !dst )
-		src_ptr = &local_src_ptr;
-	// For some reason, the standards don't mandate that the secret ps variable
-	// is reset when ps is NULL, unlike mbstowcs that always resets this
-	// variable. We'll avoid resetting the variable here in case any programs
-	// actually take advantage of this fact.
-	static mbstate_t static_ps;
-	if ( !ps )
-		ps = &static_ps;
-	size_t ret = 0;
-	size_t src_len = wcslen(*src_ptr);
-	char buf[MB_CUR_MAX];
-	while ( !dst || dst_len )
-	{
-		mbstate_t saved_ps = *ps;
-		size_t produced = wcrtomb(buf, **src_ptr, ps);
-		if ( produced == (size_t) -1 )
-			return (size_t) -1;
-		if ( dst && dst_len < produced )
-		{
-			*ps  = saved_ps;
-			break;
-		}
-		memcpy(dst, buf, produced);
-		if ( **src_ptr == L'\0' )
-		{
-			ret += produced - 1; // Don't count the '\0' byte.
-			*src_ptr = NULL;
-			break;
-		}
-		ret += produced;
-		(*src_ptr)++;
-		src_len--;
-		if ( dst )
-			dst += produced,
-			dst_len -= produced;
-		ret++;
-	}
-	return ret;
+	return wcsnrtombs(dst, src_ptr, SIZE_MAX, dst_len, ps);
 }