git.sesse.net Git - plocate/commitdiff
Escape unprintable characters when outputting filenames to a terminal.
author Steinar H. Gunderson <steinar+git@gunderson.no>
Thu, 29 Oct 2020 22:42:01 +0000 (23:42 +0100)
committer Steinar H. Gunderson <steinar+git@gunderson.no>
Thu, 29 Oct 2020 22:42:01 +0000 (23:42 +0100)
Filenames are generally untrusted, and can contain any kind of cruft.
In particular, there have been terminals (hopefully not in wide use anymore!)
that will do insanity like running specific commands when seeing a
specific escape sequence. More prosaically, embedded newlines can
make for confusing output.

Thus, escape any nonprintable characters in a shell-parseable way,
much the same way GNU ls does these days. Also escape quotes, backslashes
and the like to make sure nothing unescaped looks like it's escaped.
This doesn't mean it's safe to take whatever and parse it uncritically
(we don't escape $, for instance), but it's generally good enough.

Escaping is disabled when doing zero-terminated output, or when printing
to a pipe or file.

options.h
plocate.cpp
serializer.cpp

index 68f712e843e487688667531fe183e1b2ca6742ac..5dc5ad4c55ee543a75d13e77a66b1743db277da7 100644 (file)
--- a/options.h
+++ b/options.h
@@ -13,5 +13,6 @@ extern bool patterns_are_regex;
 extern bool use_extended_regex;
 extern int64_t limit_matches;
 extern int64_t limit_left;  // Not strictly an option.
+extern bool stdout_is_tty;  // Same.
 
 #endif  // !defined(_OPTIONS_H)
index c06a28bf2dd24a233c7d8a2726822bc23cd6d8cc..423aaf7e632f408bcdedcb48163c1918e7a7510b 100644 (file)
@@ -55,6 +55,7 @@ bool use_extended_regex = false;
 bool match_basename = false;
 int64_t limit_matches = numeric_limits<int64_t>::max();
 int64_t limit_left = numeric_limits<int64_t>::max();
+bool stdout_is_tty = false;
 
 steady_clock::time_point start;
 ZSTD_DDict *ddict = nullptr;
@@ -767,6 +768,10 @@ int main(int argc, char **argv)
                }
        }
 
+       if (!print_nul) {
+               stdout_is_tty = isatty(1);
+       }
+
        vector<Needle> needles;
        for (int i = optind; i < argc; ++i) {
                Needle needle;
index 10a156411b5e551cec7e51be66907fef545a80c0..f20c304de54e690a5c41852127da93b6b8c3c2bb 100644 (file)
@@ -27,6 +27,121 @@ void apply_limit()
        exit(0);
 }
 
+void print_possibly_escaped(const string &str)
+{
+       if (print_nul) {
+               printf("%s%c", str.c_str(), 0);
+               return;
+       } else if (!stdout_is_tty) {
+               printf("%s\n", str.c_str());
+               return;
+       }
+
+       // stdout is a terminal, so we should protect the user against
+       // escapes, stray newlines and the likes. First of all, check if
+       // all the characters are safe; we consider everything safe that
+       // isn't a control character, ', " or \. People could make
+       // filenames like "$(rm -rf)", but that's out-of-scope.
+       const char *ptr = str.data();
+       size_t len = str.size();
+
+       mbtowc(nullptr, 0, 0);
+       wchar_t pwc;
+       bool all_safe = true;
+       do {
+               int ret = mbtowc(&pwc, ptr, len);
+               if (ret == -1) {
+                       all_safe = false;  // Malformed data.
+               } else if (ret == 0) {
+                       break;  // EOF.
+               } else if (pwc < 32 || pwc == '\'' || pwc == '"' || pwc == '\\') {
+                       all_safe = false;
+               } else {
+                       ptr += ret;
+                       len -= ret;
+               }
+       } while (all_safe);
+
+       if (all_safe) {
+               printf("%s\n", str.c_str());
+               return;
+       }
+
+       // Print escaped, but in such a way that the user can easily take the
+       // escaped output and paste into the shell. We print much like GNU ls does,
+       // ie., using the shell $'foo' construct whenever we need to print something
+       // escaped.
+       bool in_escaped_mode = false;
+       printf("'");
+
+       mbtowc(nullptr, 0, 0);
+       ptr = str.data();
+       len = str.size();
+       for (;;) {
+               int ret = mbtowc(nullptr, ptr, len);
+               if (ret == -1) {
+                       // Malformed data.
+                       printf("?");
+                       ++ptr;
+                       --len;
+               } else if (ret == 0) {
+                       break;  // EOF.
+               }
+               if (*ptr < 32 || *ptr == '\'' || *ptr == '"' || *ptr == '\\') {
+                       if (!in_escaped_mode) {
+                               printf("'$'");
+                               in_escaped_mode = true;
+                       }
+
+                       // The list of allowed escapes is from bash(1).
+                       switch (*ptr) {
+                       case '\a':
+                               printf("\\a");
+                               break;
+                       case '\b':
+                               printf("\\b");
+                               break;
+                       case '\f':
+                               printf("\\f");
+                               break;
+                       case '\n':
+                               printf("\\n");
+                               break;
+                       case '\r':
+                               printf("\\r");
+                               break;
+                       case '\t':
+                               printf("\\t");
+                               break;
+                       case '\v':
+                               printf("\\v");
+                               break;
+                       case '\\':
+                               printf("\\\\");
+                               break;
+                       case '\'':
+                               printf("\\'");
+                               break;
+                       case '"':
+                               printf("\\\"");
+                               break;
+                       default:
+                               printf("\\%03o", *ptr);
+                               break;
+                       }
+               } else {
+                       if (in_escaped_mode) {
+                               printf("''");
+                               in_escaped_mode = false;
+                       }
+                       fwrite(ptr, ret, 1, stdout);
+               }
+               ptr += ret;
+               len -= ret;
+       }
+       printf("'\n");
+}
+
 void Serializer::print(uint64_t seq, uint64_t skip, const string msg)
 {
        if (only_count) {
@@ -42,11 +157,7 @@ void Serializer::print(uint64_t seq, uint64_t skip, const string msg)
        }
 
        if (!msg.empty()) {
-               if (print_nul) {
-                       printf("%s%c", msg.c_str(), 0);
-               } else {
-                       printf("%s\n", msg.c_str());
-               }
+               print_possibly_escaped(msg);
                apply_limit();
        }
        next_seq += skip;
@@ -54,11 +165,7 @@ void Serializer::print(uint64_t seq, uint64_t skip, const string msg)
        // See if any delayed prints can now be dealt with.
        while (!pending.empty() && pending.top().seq == next_seq) {
                if (!pending.top().msg.empty()) {
-                       if (print_nul) {
-                               printf("%s%c", pending.top().msg.c_str(), 0);
-                       } else {
-                               printf("%s\n", pending.top().msg.c_str());
-                       }
+                       print_possibly_escaped(pending.top().msg);
                        apply_limit();
                }
                next_seq += pending.top().skip;