1 files changed, 244 insertions, 20 deletions
diff --git a/bkt b/bkt
index 42ea6ea..b6668f2 100755
--- a/bkt
+++ b/bkt
@@ -1,10 +1,12 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
+use warnings;
 use Getopt::Std;
+use open ":locale";
 $Getopt::Std::STANDARD_HELP_VERSION++;
 
 ($SELF = $0) =~ s,.*/,,;
-$VERSION="0.0.0";
+$VERSION="0.0.1";
 
 sub HELP_MESSAGE {
 	print <<EOF;
@@ -26,19 +28,73 @@ foo	2	40.0%
 
 Usage: $SELF <options> <file> ...
 
-Options are:
+General options:
   --help
   -h        display this help message
+
+Output options:
   -c        show counts only (suppress percentages)
   -p        show percentages only (suppress counts)
+  -t        show total count
+  -x        print output in hexadecimal
+  -s opts   output sort options. opts may include:
+            r - reverse sort
+            f - fold case
+            c - sort by count rather than input string
+  -T thresh filter out results below threshold (which may be a
+            count or a percentage, e.g. 5%).
+  -P        don't pad output with spaces to length of longest element
+            -o option enables this as well.
+
+Input options:
+  -B        binary mode (default: input is characters in current locale)
+  -o string use string as output delimiter (default: \\t). implies -P.
+  -b range  consider only a range of chars/bytes in each line (e.g. 1-3)
+  -d delim  split on delimiter (default: /\\s+/ aka whitespace)
+  -f field  consider only this (delimiter-separated) field
+  -i        case insensitive (actually, lowercases all input)
   -w        remove leading and trailing whitespace from input lines
   -W        remove ALL whitespace from input lines
-  -t        show total count
   -e code   execute perl code for each line of input (should modify \$_,
             make sure you quote the argument as needed by your shell)
 
 Input will be read from filenames given on the command line, or from
-standard input if none given. The input need not be sorted.
+standard input if none given. The input need not be sorted. The output
+will always be sorted.
+
+Each line of input is chomped (has trailing \\n removed) before any
+further processing.
+
+-b is like the -b or -c option to cut(1) (depending on whether -B is
+set). It supports the same type of range as cut(1):
+
+N      N'th byte/character, counted from 1
+N-     from N'th byte/character to end of line
+N-M    from N'th to M'th (included) byte/character
+-M     from first to M'th (included) byte/character
+
+...except that cut allows many ranges separated by commas, while $SELF
+-b only allows a single range.
+
+-d is like the the -d option to cut(1), except that the delimiter can be
+multiple characters. Also, the delimiter is treated as a regular expression
+if it's at least 4 characters long *and* enclosed in //.
+
+-f like cut's -f, except that it only allows a single field number (not
+a list), which is indexed starting from 1 (same as cut)... or a negative
+number, meaning the Nth field from the right (-1 = rightmost). Also
+unlike cut, -f and -b may be combined (-b is applied first).
+
+The -b -f -i -w -W -e<code> options will be processed in the order
+listed here, regardless of the order they're given on the command
+line. In particular, this means the code for -e will see \$_ *after*
+it's been modified by any of the other options.
+
+There can only be one -e option, but it may be multiple lines of code
+separated with semicolons (like perl's own -e option). When the -e code
+runs, \$_ contains the input (possibly tranformed by other options),
+and can be modified arbitratily. The -e code can filter out unwanted
+lines by executing "next", which will cause them to be skipped entirely.
 EOF
 }
 
@@ -46,31 +102,172 @@ sub VERSION_MESSAGE {
 	print "$SELF $VERSION\n";
 }
 
-getopts('hcpwWte:', \%opt);
+sub hexify {
+	my @out = ();
+	push @out, sprintf("%x", ord($_)) for split "", $_[0];
+	return join(" ", @out);
+}
+
+# main()
+getopts('hcpiwWte:d:f:b:xo:Bs:T:P', \%opt);
 
 HELP_MESSAGE() if $opt{h};
 
-die "$SELF: can't give -c and -p options together\n"
+die "$SELF: options -c and -p are mutually exclusive\n"
 	if $opt{c} && $opt{p};
 
+if(defined $opt{o}) {
+	$opt{P} = 1;
+} else {
+	$opt{o} = "\t";
+}
+
+if(defined $opt{d}) {
+	if($opt{d} =~ m|^/(.{2,})/$|) {
+		$opt{d} = qr/$1/;
+	} else {
+		$opt{d} = quotemeta($opt{d});
+		$opt{d} = qr/$opt{d}/;
+	}
+} else {
+	$opt{d} = qr/\s+/;
+}
+
+if(defined $opt{b}) {
+	for($opt{b}) {
+		/^(\d+)$/ && do { $substrarg = "$1 - 1, 1" };
+		/^(\d+)-$/ && do { $substrarg = "$1 - 1" };
+		/^-(\d+)$/ && do { $substrarg = "0, $1" };
+		/^(\d+)-(\d+)$/ && do { $substrarg = "$1 - 1, " . ($2 - $1 + 1) };
+	}
+	die "$SELF: invalid -b argument\n" unless $substrarg;
+}
+
+# -f index starts at 1, perl arrays are indexed from 0, fix (but
+# don't break using -1 for rightmost field)
+$opt{f}-- if defined $opt{f} && $opt{f} > 0;
+
+if($opt{s}) {
+	for(split "", $opt{s}) {
+		/r/ && do { $revsort++;   };
+		/f/ && do { $foldsort++;  };
+		/c/ && do { $countsort++; };
+		/([^rfc])/ && do {
+			warn "$SELF: ignoring unknown sort option '$1'\n"; };
+	}
+}
+
+if($countsort && $foldsort) {
+	die "$SELF: sort options 'c' and 'f' are mutually exclusive\n";
+}
+
+if($countsort) {
+	$sortcode = "{ " .
+		'$counts{' . ($revsort ? '$b' : '$a') . '}' .
+		" <=> " .
+		'$counts{' . ($revsort ? '$a' : '$b') . '}' .
+		"}";
+} else {
+	$sortcode = "{ " .
+		($foldsort ? 'lc ' : "" ) .
+		($revsort ? '$b' : '$a') .
+		" cmp " .
+		($foldsort ? 'lc ' : "" ) .
+		($revsort ? '$a' : '$b') .
+		"}";
+}
+
+# finally done with option processing, let the main event commence.
+
 %counts = ();
 $total = 0;
+$longest = 0;
+$readfiles = 0;
+$badfiles = 0;
+
+# Sadly, we can't use the magical while(<>) here to automatically iterate
+# and open all the files in @ARGV, because of the locale stuff. We need to
+# call binmode() on each filehandle after it's opened, but before anything
+# gets read from it. 'use open ":bytes"' would set the default binmode,
+# but I couldn't get it to work conditionally (not even with eval).
+
+$ARGV[0] = '-' unless @ARGV;
+for(@ARGV) {
+	my $fh;
+
+	if($_ eq '-') {
+		$fh = \*STDIN;
+	} else {
+		open $fh, '<', $_ or do {
+			warn "$SELF: $_: $!\n";
+			$badfiles++;
+			next;
+		};
+	}
+
+	binmode $fh, ":bytes" if $opt{B};
+	$readfiles++;
+
+	while(<$fh>) {
+		chomp;
+
+		# behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist,
+		# or -b10 but only 9 characters exist.
+
+		if($substrarg) { # set via $opt{b}
+			no warnings qw/substr/;
+			eval "\$_ = substr(\$_, $substrarg)";
+			$_ = "" unless defined $_;
+		}
+
+		if($opt{f}) {
+			$_ = (split($opt{d}))[$opt{f}];
+			$_ = "" unless defined $_;
+		}
+
+		$_ = lc if $opt{i};
+		s/^\s+|\s+$//g if $opt{w};
+		s/\s//g if $opt{W};
+
+		if($opt{e}) {
+			no warnings qw/exiting/; # so -e code can "next" to skip a line
+			eval $opt{e};
+			die "$SELF: $@" if $@;
+		}
 
-while(<>) {
-	chomp;
-	if($opt{e}) {
-		eval $opt{e};
-		die "$SELF: $@" if $@;
+		$counts{$_}++;
+		$total++;
 	}
-	s/^\s+|\s+$//g if $opt{w};
-	s/\s//g if $opt{W};
-	$counts{$_}++;
-	$total++;
 }
 
-for(sort keys %counts) {
-	print $_ . "\t";
-	print $counts{$_} . "\t" unless $opt{p};
+die "$SELF: couldn't read any input files\n" unless $readfiles;
+
+if($opt{T}) {
+	(my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
+	if($thresh) {
+		for(keys %counts) {
+			delete $counts{$_} if
+				($pct && (($counts{$_} * 100 / $total) < $thresh)) ||
+				(!$pct && ($counts{$_} < $thresh));
+		}
+	} else {
+		warn "$SELF: invalid argument for -T\n";
+	}
+}
+
+if(!$opt{P}) {
+	for(keys %counts) {
+		my $l = length($opt{x} ? hexify($_) : $_);
+		$longest = $l if $longest < $l;
+	}
+}
+
+# done reading & counting all input, show the results.
+for(sort { eval $sortcode } keys %counts) {
+	print ($opt{x} ? hexify($_) : $_);
+	print " " x ($longest - length) unless $opt{P};
+	print $opt{o};
+	print $counts{$_} . $opt{o} unless $opt{p};
 	printf "%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
 	print "\n";
 }
@@ -79,4 +276,31 @@ if($opt{t}) {
 	print "\n-- Total count: $total\n";
 }
 
-exit(0);
+# be like cat, exit with error status if any input file couldn't be
+# read (even if we did successfully read others)
+exit($badfiles != 0);
+
+__END__
+
+Examples:
+
+# show the percentage of binaries that start with each letter/number/etc,
+# 4 different ways
+cd /usr/bin
+ls | bkt -b1
+ls | cut -b1 | bkt
+ls | bkt -e '$_=substr($_,0,1)'
+ls | bkt -e 's,^(.).*,$1,'
+
+# show percentages of stuff said by each user in an irssi IRC log. relies
+# on the log format having a timestamp, space, <nick> for normal lines.
+# misses /me actions entirely though.
+# add -src to show the most talkative first.
+bkt -f2 -e 'next unless /^\</' channelname.log
+
+# show us how many users use each shell (including stuff like /bin/false).
+bkt -d: -f-1 /etc/passwd
+
+# how many images of each type have we got? ignore case, so JPG and jpg
+# are counted together.
+ls ~/images/*.* | bkt -d. -f-1