diff --git a/doc/user-guide b/doc/user-guide
index b135d98f..e31d7618 100644
--- a/doc/user-guide
+++ b/doc/user-guide
@@ -190,6 +190,7 @@ Sortix comes with a number of home-made programs. Here is an overview:
* `sort` - sort lines of text files
* `tail` - display end of file
* `time` - measure program running time
+* `tr` - translate, squeeze and/or delete characters
* `true` - exit with a success status
* `type` - type raw characters directly into the terminal
* `uname` - system information
diff --git a/utils/.gitignore b/utils/.gitignore
index 8c0d8dd8..a09eb63c 100644
--- a/utils/.gitignore
+++ b/utils/.gitignore
@@ -38,6 +38,7 @@ sh
sort
tail
time
+tr
true
type
uname
diff --git a/utils/Makefile b/utils/Makefile
index a209949d..ba46df35 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -54,6 +54,7 @@ sh \
sort \
tail \
time \
+tr \
true \
type \
uname \
diff --git a/utils/tr.cpp b/utils/tr.cpp
new file mode 100644
index 00000000..b97b8e49
--- /dev/null
+++ b/utils/tr.cpp
@@ -0,0 +1,676 @@
+/*******************************************************************************
+
+ Copyright(C) Jonas 'Sortie' Termansen 2014.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the Free
+ Software Foundation, either version 3 of the License, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program. If not, see <http://www.gnu.org/licenses/>.
+
+ tr.cpp
+ Translate, squeeze and/or delete characters.
+
+*******************************************************************************/
+
+#include <assert.h>
+#include <ctype.h>
+#include <error.h>
+#include <limits.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* Kinds of set-operand constructs recognised by the parser. */
+enum construct_type
+{
+ CONSTRUCT_TYPE_NONE, /* nothing parsed: the operand string is exhausted */
+ CONSTRUCT_TYPE_CHARACTER, /* one literal or backslash-escaped character */
+ CONSTRUCT_TYPE_CHARACTER_CLASS, /* [:name:] */
+ CONSTRUCT_TYPE_CHARACTER_RANGE, /* from-to */
+ CONSTRUCT_TYPE_CHARACTER_REPEAT, /* [c*n] */
+ CONSTRUCT_TYPE_EQUIVALENCE_CLASS, /* [=c=] */
+};
+/* Payload of CONSTRUCT_TYPE_CHARACTER. */
+struct construct_character
+{
+ unsigned char c; /* the character itself */
+};
+/* Payload of CONSTRUCT_TYPE_CHARACTER_CLASS. */
+struct construct_character_class
+{
+ int (*ctype)(int); /* membership predicate such as isalpha or isdigit */
+};
+/* Payload of CONSTRUCT_TYPE_CHARACTER_RANGE; both endpoints are inclusive. */
+struct construct_character_range
+{
+ unsigned char from; /* first character produced */
+ unsigned char to; /* last character produced; may be lower than from */
+};
+/* Payload of CONSTRUCT_TYPE_CHARACTER_REPEAT. */
+struct construct_character_repeat
+{
+ unsigned char c; /* character to emit */
+ size_t repetitions; /* how many times; 0 as parsed means no explicit count */
+};
+/* Payload of CONSTRUCT_TYPE_EQUIVALENCE_CLASS. */
+struct construct_equivalence_class
+{
+ unsigned char c; /* the one character the iterator yields for this class */
+};
+/* One parsed construct: a type tag plus the payload matching that tag. */
+struct construct
+{
+ enum construct_type type; /* selects which union member is meaningful */
+ union
+ {
+ struct construct_character character;
+ struct construct_character_class character_class;
+ struct construct_character_range character_range;
+ struct construct_character_repeat character_repeat;
+ struct construct_equivalence_class equivalence_class;
+ };
+};
+/* Parse one literal or backslash-escaped character; NULL at end of input. */
+const char* parse_construct_character(const char* string,
+                                      struct construct* construct)
+{
+    if ( !string || string[0] == '\0' )
+    {
+        construct->type = CONSTRUCT_TYPE_NONE;
+        return NULL;
+    }
+    /* Otherwise *construct becomes a CHARACTER; returns the unparsed rest. */
+    if ( string[0] == '\\' )
+    {
+        if ( string[1] == '\0' ) /* the backslash is the last character */
+            error(1, 0, "unescaped backslash at end of string");
+
+        if ( '0' <= string[1] && string[1] <= '3' ) /* up to 3 octal digits */
+        {
+            unsigned char value = 0;
+            size_t i = 1;
+            value = value * 8 + string[i++] - '0';
+            if ( '0' <= string[i] && string[i] <= '7' )
+                value = value * 8 + string[i++] - '0';
+            if ( '0' <= string[i] && string[i] <= '7' )
+                value = value * 8 + string[i++] - '0';
+            construct->type = CONSTRUCT_TYPE_CHARACTER;
+            construct->character.c = value;
+            return string + i;
+        }
+        else if ( '0' <= string[1] && string[1] <= '7' ) /* 4-7: two at most */
+        {
+            unsigned char value = 0;
+            size_t i = 1;
+            value = value * 8 + string[i++] - '0';
+            if ( '0' <= string[i] && string[i] <= '7' )
+                value = value * 8 + string[i++] - '0';
+            construct->type = CONSTRUCT_TYPE_CHARACTER;
+            construct->character.c = value;
+            return string + i;
+        }
+
+        unsigned char value;
+        switch ( string[1] )
+        {
+        case '\\': value = '\\'; break;
+        case 'a': value = '\a'; break;
+        case 'b': value = '\b'; break;
+        case 'e': value = '\e'; break;
+        case 'f': value = '\f'; break;
+        case 'n': value = '\n'; break;
+        case 'r': value = '\r'; break;
+        case 't': value = '\t'; break;
+        case 'v': value = '\v'; break;
+        default: value = string[1]; break;
+        }
+        /* Any other escaped character stands for itself. */
+        construct->type = CONSTRUCT_TYPE_CHARACTER;
+        construct->character.c = value;
+        return string + 2;
+    }
+
+    construct->type = CONSTRUCT_TYPE_CHARACTER;
+    construct->character.c = (unsigned char) string[0];
+    return string + 1;
+}
+/* Parse the next construct of a set operand; NULL once nothing is left. */
+const char* parse_construct(const char* string, struct construct* construct)
+{
+    if ( !string || string[0] == '\0' )
+    {
+        construct->type = CONSTRUCT_TYPE_NONE;
+        return NULL;
+    }
+    /* [:name:] -- input without a closing ":]" falls through to the rest. */
+    if ( string[0] == '[' && string[1] == ':' )
+    {
+        size_t start = 2;
+        size_t end = start;
+        while ( !(string[end] == '\0' ||
+                  (string[end + 0] == ':' && string[end + 1] == ']')) )
+            end++;
+        if ( string[end] == ':' && string[end + 1] == ']' )
+        {
+            construct->type = CONSTRUCT_TYPE_CHARACTER_CLASS;
+            if ( !strncmp(string, "[:alnum:]", strlen("[:alnum:]")) )
+                construct->character_class.ctype = isalnum;
+            else if ( !strncmp(string, "[:alpha:]", strlen("[:alpha:]")) )
+                construct->character_class.ctype = isalpha;
+            else if ( !strncmp(string, "[:blank:]", strlen("[:blank:]")) )
+                construct->character_class.ctype = isblank;
+            else if ( !strncmp(string, "[:cntrl:]", strlen("[:cntrl:]")) )
+                construct->character_class.ctype = iscntrl;
+            else if ( !strncmp(string, "[:digit:]", strlen("[:digit:]")) )
+                construct->character_class.ctype = isdigit;
+            else if ( !strncmp(string, "[:graph:]", strlen("[:graph:]")) )
+                construct->character_class.ctype = isgraph;
+            else if ( !strncmp(string, "[:lower:]", strlen("[:lower:]")) )
+                construct->character_class.ctype = islower;
+            else if ( !strncmp(string, "[:print:]", strlen("[:print:]")) )
+                construct->character_class.ctype = isprint;
+            else if ( !strncmp(string, "[:punct:]", strlen("[:punct:]")) )
+                construct->character_class.ctype = ispunct;
+            else if ( !strncmp(string, "[:space:]", strlen("[:space:]")) )
+                construct->character_class.ctype = isspace;
+            else if ( !strncmp(string, "[:upper:]", strlen("[:upper:]")) )
+                construct->character_class.ctype = isupper;
+            else if ( !strncmp(string, "[:xdigit:]", strlen("[:xdigit:]")) )
+                construct->character_class.ctype = isxdigit;
+            else
+            {
+                char* class_name = strndup(string + start, end - start);
+                error(1, 0, "invalid character class `%s'", class_name);
+                __builtin_unreachable();
+            }
+            return string + end + 2;
+        }
+    }
+    /* [=c=] -- exactly one (possibly escaped) character between the signs. */
+    if ( string[0] == '[' && string[1] == '=' )
+    {
+        size_t start = 2;
+        size_t end = start;
+        while ( !(string[end] == '\0' ||
+                  (string[end + 0] == '=' && string[end + 1] == ']')) )
+            end++;
+        if ( string[end] == '=' && string[end + 1] == ']' )
+        {
+            struct construct eq_construct;
+            const char* eq_end =
+                parse_construct_character(string + start, &eq_construct);
+            if ( !eq_end )
+                error(1, 0, "malformed equivalence class");
+            if ( eq_end > string + end )
+                error(1, 0, "malformed equivalence class");
+            if ( eq_end[0] != '=' || eq_end[1] != ']' )
+                error(1, 0, "equivalence class operand must be a single character");
+            construct->type = CONSTRUCT_TYPE_EQUIVALENCE_CLASS;
+            construct->equivalence_class.c = eq_construct.character.c;
+            return eq_end + 2;
+        }
+    }
+    /* [c*n] -- n is read as octal when it starts with 0; no digits gives 0. */
+    if ( string[0] == '[' )
+    {
+        struct construct c_construct;
+        const char* c_end = parse_construct_character(string + 1, &c_construct);
+        if ( c_end && c_end[0] == '*' &&
+             c_end[1 + strspn(c_end + 1, "0123456789")] == ']' )
+        {
+            char* count_end;
+            int base = c_end[1] == '0' ? 8 : 10;
+            unsigned long repetitions = strtoul(c_end + 1, &count_end, base);
+            if ( count_end[0] != ']' ) /* e.g. an 8 or 9 inside an octal count */
+                error(1, 0, "invalid repeat count in `%s'", string);
+            construct->type = CONSTRUCT_TYPE_CHARACTER_REPEAT;
+            construct->character_repeat.c = c_construct.character.c;
+            construct->character_repeat.repetitions = repetitions;
+            return count_end + 1;
+        }
+    }
+    /* from-to range, or failing that a single character. */
+    struct construct result_construct;
+    const char* result = parse_construct_character(string, &result_construct);
+    if ( result && result[0] == '-' )
+    {
+        struct construct second_construct;
+        const char* second = parse_construct_character(result + 1, &second_construct);
+        if ( second )
+        {
+            construct->type = CONSTRUCT_TYPE_CHARACTER_RANGE;
+            construct->character_range.from = result_construct.character.c;
+            construct->character_range.to = second_construct.character.c;
+            return second;
+        }
+    }
+    return *construct = result_construct, result;
+}
+/* Cursor that yields the characters denoted by a set operand one at a time. */
+struct construct_iterator
+{
+ struct construct construct; /* construct currently being expanded */
+ struct construct* counterpart; /* other set's construct, or NULL; sizes [c*] */
+ const char* string; /* unparsed remainder of the operand */
+ unsigned char character_counter; /* next candidate within a class or range */
+ size_t counter; /* repetitions already produced for a repeat construct */
+ bool loop_done; /* set once a class or range has been exhausted */
+};
+/* Yield the next character of the set in *out; false once it is exhausted. */
+bool iterate_constructs(struct construct_iterator* iter, unsigned char* out)
+{
+iterate_retry:
+    switch ( iter->construct.type )
+    {
+    case CONSTRUCT_TYPE_CHARACTER_CLASS:
+        while ( !iter->loop_done )
+        {
+            unsigned char c = iter->character_counter;
+            if ( iter->character_counter == UCHAR_MAX )
+                iter->loop_done = true;
+            else
+                iter->character_counter++;
+            if ( iter->construct.character_class.ctype(c) )
+                return *out = c, true;
+        }
+        break;
+    case CONSTRUCT_TYPE_CHARACTER_RANGE:
+        if ( iter->loop_done )
+            break;
+        *out = iter->character_counter;
+        if ( iter->character_counter == iter->construct.character_range.to )
+            iter->loop_done = true;
+        else if ( iter->character_counter < iter->construct.character_range.to )
+            iter->character_counter++;
+        else
+            iter->character_counter--;
+        return true;
+    case CONSTRUCT_TYPE_CHARACTER_REPEAT:
+        if ( iter->counter < iter->construct.character_repeat.repetitions )
+        {
+            iter->counter++;
+            return *out = iter->construct.character_repeat.c, true;
+        }
+        break;
+    case CONSTRUCT_TYPE_NONE:
+    case CONSTRUCT_TYPE_CHARACTER:
+    case CONSTRUCT_TYPE_EQUIVALENCE_CLASS:
+        break;
+    }
+    /* The current construct is used up: parse the next one, if any. */
+    if ( !(iter->string = parse_construct(iter->string, &iter->construct)) )
+        return false;
+
+    switch ( iter->construct.type )
+    {
+    case CONSTRUCT_TYPE_NONE:
+        __builtin_unreachable();
+    case CONSTRUCT_TYPE_CHARACTER:
+        return *out = iter->construct.character.c, true;
+    case CONSTRUCT_TYPE_CHARACTER_CLASS:
+        iter->character_counter = 0;
+        iter->loop_done = false;
+        goto iterate_retry;
+    case CONSTRUCT_TYPE_CHARACTER_RANGE:
+        iter->character_counter = iter->construct.character_range.from;
+        iter->loop_done = false;
+        goto iterate_retry;
+    case CONSTRUCT_TYPE_CHARACTER_REPEAT:
+        iter->counter = 0; /* each repeat construct counts from zero again */
+        if ( iter->counterpart && !iter->construct.character_repeat.repetitions )
+        {
+            struct construct* counterpart = iter->counterpart;
+            size_t repetitions = 0;
+            switch ( iter->counterpart->type )
+            {
+            case CONSTRUCT_TYPE_NONE:
+                repetitions = 0;
+                break;
+            case CONSTRUCT_TYPE_CHARACTER:
+                repetitions = 1;
+                break;
+            case CONSTRUCT_TYPE_CHARACTER_CLASS:
+                for ( unsigned char c = 0; true; c++ )
+                {
+                    if ( counterpart->character_class.ctype(c) )
+                        repetitions++;
+                    if ( c == UCHAR_MAX )
+                        break;
+                }
+                break;
+            case CONSTRUCT_TYPE_CHARACTER_REPEAT:
+                repetitions = counterpart->character_repeat.repetitions;
+                break;
+            case CONSTRUCT_TYPE_CHARACTER_RANGE:
+                /* Count both endpoints, whichever direction the range runs. */
+                for ( unsigned char c = counterpart->character_range.from; true; )
+                {
+                    repetitions++;
+                    if ( c == counterpart->character_range.to )
+                        break;
+                    if ( c < counterpart->character_range.to )
+                        c++;
+                    else
+                        c--;
+                }
+                break;
+            case CONSTRUCT_TYPE_EQUIVALENCE_CLASS:
+                repetitions = 1;
+                break;
+            }
+            iter->construct.character_repeat.repetitions = repetitions;
+        }
+        goto iterate_retry;
+    case CONSTRUCT_TYPE_EQUIVALENCE_CLASS:
+        return *out = iter->construct.equivalence_class.c, true;
+    }
+    __builtin_unreachable();
+}
+/* Iterator that keeps yielding its final character once the set runs out. */
+struct construct_iterator_repeat
+{
+ struct construct_iterator iterator; /* underlying set iterator */
+ unsigned char last_c; /* most recently returned character */
+ bool has_last_c; /* whether last_c holds a value yet */
+};
+/* Next character of the set; the final one repeats once the set runs out. */
+unsigned char iterate_constructs_repeat(struct construct_iterator_repeat* iter)
+{
+    unsigned char next;
+    if ( iterate_constructs(&iter->iterator, &next) )
+        return iter->has_last_c = true, iter->last_c = next;
+    /* The set is exhausted. Without any earlier character there is nothing
+       to repeat, so that case is reported as a fatal usage error. */
+    if ( !iter->has_last_c )
+        error(1, 0, "when not truncating set1, string2 must be non-empty");
+    return iter->last_c;
+}
+
+/* Build the byte translation table for `tr string_1 string_2`.
+   translator must have room for UCHAR_MAX + 1 entries, one per byte value.
+   Every entry starts out as the identity mapping; the characters produced by
+   string_1 are then paired, in order, with the characters produced by
+   string_2, whose last character is reused once string_2 runs out. A
+   string_2 that produces nothing while string_1 does is a fatal error. */
+void calculate_translator(unsigned char translator[UCHAR_MAX + 1],
+                          const char* string_1,
+                          const char* string_2)
+{
+    for ( int i = 0; i <= UCHAR_MAX; i++ )
+        translator[i] = (unsigned char) i;
+
+    struct construct_iterator s1i;
+    memset(&s1i, 0, sizeof(s1i));
+    s1i.construct.type = CONSTRUCT_TYPE_NONE;
+    s1i.string = string_1;
+
+    struct construct_iterator_repeat s2i;
+    memset(&s2i, 0, sizeof(s2i));
+    s2i.iterator.construct.type = CONSTRUCT_TYPE_NONE;
+    s2i.iterator.string = string_2;
+    /* Lets a [c*] without a count in string_2 size itself from string_1. */
+    s2i.iterator.counterpart = &s1i.construct;
+
+    unsigned char c1;
+    while ( iterate_constructs(&s1i, &c1) )
+        translator[c1] = iterate_constructs_repeat(&s2i);
+}
+
+/* Build the translation table for `tr -c string_1 string_2`.
+   translator must have room for UCHAR_MAX + 1 entries, one per byte value.
+   Bytes produced by string_1 keep the identity mapping; every other byte is
+   mapped, in ascending byte order, to the next character of string_2, whose
+   last character is reused once string_2 runs out. */
+void calculate_translator_complement(unsigned char translator[UCHAR_MAX + 1],
+                                     const char* string_1,
+                                     const char* string_2)
+{
+    for ( int i = 0; i <= UCHAR_MAX; i++ )
+        translator[i] = (unsigned char) i;
+
+    /* One membership flag per byte value, index UCHAR_MAX included. */
+    bool s1_members[UCHAR_MAX + 1];
+    memset(&s1_members, 0, sizeof(s1_members));
+
+    struct construct_iterator s1i;
+    memset(&s1i, 0, sizeof(s1i));
+    s1i.construct.type = CONSTRUCT_TYPE_NONE;
+    s1i.string = string_1;
+
+    unsigned char c1;
+    while ( iterate_constructs(&s1i, &c1) )
+        s1_members[c1] = true;
+
+    struct construct_iterator_repeat s2i;
+    memset(&s2i, 0, sizeof(s2i));
+    s2i.iterator.construct.type = CONSTRUCT_TYPE_NONE;
+    s2i.iterator.string = string_2;
+
+    for ( int i = 0; i <= UCHAR_MAX; i++ )
+    {
+        if ( !s1_members[i] )
+            translator[i] = iterate_constructs_repeat(&s2i);
+    }
+}
+
+/* Record the characters produced by string_1 as a membership table:
+   deletes[c] becomes true for each of them and false for every other byte.
+   deletes must have room for UCHAR_MAX + 1 entries; despite the parameter
+   name, main also uses this to fill its squeeze table. */
+void calculate_character_set(bool deletes[UCHAR_MAX + 1],
+                             const char* string_1)
+{
+    for ( int i = 0; i <= UCHAR_MAX; i++ )
+        deletes[i] = false;
+
+    struct construct_iterator s1i;
+    memset(&s1i, 0, sizeof(s1i));
+    s1i.construct.type = CONSTRUCT_TYPE_NONE;
+    s1i.string = string_1;
+
+    unsigned char c1;
+    while ( iterate_constructs(&s1i, &c1) )
+        deletes[c1] = true;
+}
+
+/* Like calculate_character_set, but inverted: deletes[c] becomes true for
+   every byte c that string_1 does NOT produce and false for its members.
+   deletes must have room for UCHAR_MAX + 1 entries. */
+void calculate_character_set_complement(bool deletes[UCHAR_MAX + 1],
+                                        const char* string_1)
+{
+    calculate_character_set(deletes, string_1);
+
+    /* Flip every entry, index UCHAR_MAX included. */
+    for ( int i = 0; i <= UCHAR_MAX; i++ )
+        deletes[i] = !deletes[i];
+}
+/* Squeeze the NULL entries out of argv in place and lower *argc to match. */
+static void compact_arguments(int* argc, char*** argv)
+{
+    char** args = *argv;
+    for ( int slot = 0; slot < *argc; slot++ )
+    {
+        if ( args[slot] != NULL )
+            continue;
+        for ( int k = slot + 1; k <= *argc; k++ ) /* tail plus final NULL */
+            args[k - 1] = args[k];
+        (*argc)--, slot--; /* revisit the entry that moved into this slot */
+    }
+}
+/* Print the one-line usage synopsis to fp; argv0 is the invoked name. */
+static void help(FILE* fp, const char* argv0)
+{
+    fprintf(fp, "Usage: %s [OPTION]... SET1 [SET2]\n", argv0);
+}
+/* Print the program name, version and licence banner to fp. */
+static void version(FILE* fp, const char* argv0)
+{
+    fprintf(fp, "%s (Sortix) %s\n", argv0, VERSIONSTR);
+    fprintf(fp, "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>.\n");
+    fprintf(fp, "This is free software: you are free to change and redistribute it.\n");
+    fprintf(fp, "There is NO WARRANTY, to the extent permitted by law.\n");
+}
+
+/* Entry point: parse the -c/-C, -d and -s options, build the translate,
+   delete and squeeze tables from the SET operands, then filter standard
+   input to standard output one byte at a time. */
+int main(int argc, char* argv[])
+{
+    setlocale(LC_ALL, "");
+
+    bool flag_complement = false;
+    bool flag_delete = false;
+    bool flag_squeeze = false;
+
+    const char* argv0 = argv[0];
+    for ( int i = 1; i < argc; i++ )
+    {
+        const char* arg = argv[i];
+        if ( arg[0] != '-' || !arg[1] )
+            continue;
+        argv[i] = NULL; /* consumed options are dropped by compact_arguments */
+        if ( !strcmp(arg, "--") )
+            break;
+        if ( arg[1] != '-' )
+        {
+            while ( char c = *++arg ) switch ( c )
+            {
+            case 'c': flag_complement = true; break;
+            case 'C': flag_complement = true; break;
+            case 'd': flag_delete = true; break;
+            case 's': flag_squeeze = true; break;
+            default:
+                fprintf(stderr, "%s: unknown option -- '%c'\n", argv0, c);
+                help(stderr, argv0);
+                exit(1);
+            }
+        }
+        else if ( !strcmp(arg, "--help") )
+            help(stdout, argv0), exit(0);
+        else if ( !strcmp(arg, "--version") )
+            version(stdout, argv0), exit(0);
+        else
+        {
+            fprintf(stderr, "%s: unknown option: %s\n", argv0, arg);
+            help(stderr, argv0);
+            exit(1);
+        }
+    }
+
+    compact_arguments(&argc, &argv);
+
+    if ( argc <= 1 )
+        error(1, 0, "missing operand");
+    const char* string_1 = argv[1];
+
+    /* One entry per byte value, 0 through UCHAR_MAX inclusive. The defaults
+       delete nothing, squeeze nothing and translate every byte to itself. */
+    bool deletes[UCHAR_MAX + 1];
+    bool squeezes[UCHAR_MAX + 1];
+    unsigned char translator[UCHAR_MAX + 1];
+    for ( int i = 0; i <= UCHAR_MAX; i++ )
+    {
+        deletes[i] = false;
+        squeezes[i] = false;
+        translator[i] = (unsigned char) i;
+    }
+
+    if ( flag_delete && flag_squeeze )
+    {
+        /* -d -s: SET1 (or its complement) is deleted, SET2 is squeezed. */
+        if ( argc <= 2 )
+            error(1, 0, "missing operand after `%s'", string_1);
+        const char* string_2 = argv[2];
+
+        if ( 4 <= argc )
+            error(1, 0, "extra operand `%s'", argv[3]);
+
+        if ( flag_complement )
+            calculate_character_set_complement(deletes, string_1);
+        else
+            calculate_character_set(deletes, string_1);
+
+        calculate_character_set(squeezes, string_2);
+    }
+    else if ( flag_delete && !flag_squeeze )
+    {
+        if ( 3 <= argc )
+            error(1, 0, "extra operand `%s'", argv[2]);
+
+        /* -d alone: SET1 (or its complement) is the set of bytes to drop. */
+        if ( flag_complement )
+            calculate_character_set_complement(deletes, string_1);
+        else
+            calculate_character_set(deletes, string_1);
+    }
+    else if ( !flag_delete && flag_squeeze )
+    {
+        if ( argc == 2 )
+        {
+            if ( flag_complement )
+                calculate_character_set_complement(squeezes, string_1);
+            else
+                calculate_character_set(squeezes, string_1);
+        }
+        else if ( argc == 3 )
+        {
+            const char* string_2 = argv[2];
+
+            if ( flag_complement )
+                calculate_translator_complement(translator, string_1, string_2);
+            else
+                calculate_translator(translator, string_1, string_2);
+
+            calculate_character_set(squeezes, string_2);
+        }
+        else if ( 4 <= argc )
+        {
+            error(1, 0, "extra operand `%s'", argv[3]);
+        }
+    }
+    else if ( !flag_delete && !flag_squeeze )
+    {
+        if ( argc <= 2 )
+            error(1, 0, "missing operand after `%s'", string_1);
+        const char* string_2 = argv[2];
+
+        if ( 4 <= argc )
+            error(1, 0, "extra operand `%s'", argv[3]);
+
+        if ( flag_complement )
+            calculate_translator_complement(translator, string_1, string_2);
+        else
+            calculate_translator(translator, string_1, string_2);
+    }
+
+    int last_ic = EOF; /* last byte actually written, for squeezing repeats */
+    int ic;
+    while ( (ic = getchar()) != EOF )
+    {
+        ic = (int) translator[ic];
+        if ( squeezes[(unsigned char) ic] && ic == last_ic )
+            continue;
+        if ( !deletes[(unsigned char) ic] )
+        {
+            putchar(ic);
+            last_ic = ic;
+        }
+    }
+
+    if ( ferror(stdin) )
+        error(1, 0, "stdin");
+    if ( ferror(stdout) || fflush(stdout) == EOF )
+        error(1, 0, "stdout");
+
+    return 0;
+}