1 files changed, 121 insertions, 66 deletions
diff --git a/bkt b/bkt
index b6668f2..0d67366 100755
--- a/bkt
+++ b/bkt
@@ -1,17 +1,24 @@
 #!/usr/bin/perl
 
 use warnings;
-use Getopt::Std;
+no warnings 'surrogate'; ## doesn't work
+use strict;
+
 use open ":locale";
+
+use Getopt::Std;
+# this makes getopts exit after --help:
 $Getopt::Std::STANDARD_HELP_VERSION++;
 
-($SELF = $0) =~ s,.*/,,;
-$VERSION="0.0.1";
+(our $SELF = $0) =~ s,.*/,,;
+our $VERSION="0.0.1";
 
 sub HELP_MESSAGE {
 	print <<EOF;
 $SELF - count repeats in input
 
+Usage: $SELF <options> <file> ...
+
 Given the following input:
 
 foo
@@ -26,75 +33,101 @@ bar	2	40.0%
 baz	1	20.0%
 foo	2	40.0%
 
-Usage: $SELF <options> <file> ...
+The name 'bkt' comes from the concept of collecting like items in buckets
+(this is basically how a hashtable works). The original plan was to name
+this script 'bucketize', but who wants to type all that? Also, purely
+to support lazy typists, $SELF implements subsets of the functionality
+of cut(1) and sort(1).
 
 General options:
   --help
   -h        display this help message
+  --version display '$SELF $VERSION'
+  --        end of options; everything after this is treated as a filename
 
 Output options:
   -c        show counts only (suppress percentages)
   -p        show percentages only (suppress counts)
+            -c and -p may be combined, if you can find a use for it
   -t        show total count
   -x        print output in hexadecimal
   -s opts   output sort options. opts may include:
-            r - reverse sort
-            f - fold case
-            c - sort by count rather than input string
+            r - reverse sort (default is ascending)
+            a - sort alphabetically (default is by count)
+            f - sort alphabetically, folding case
   -T thresh filter out results below threshold (which may be a
             count or a percentage, e.g. 5%).
+  -o string use string as output delimiter (default: \\t). implies -P.
   -P        don't pad output with spaces to length of longest element
             -o option enables this as well.
 
 Input options:
   -B        binary mode (default: input is characters in current locale)
-  -o string use string as output delimiter (default: \\t). implies -P.
-  -b range  consider only a range of chars/bytes in each line (e.g. 1-3)
-  -d delim  split on delimiter (default: /\\s+/ aka whitespace)
+  -/ sep    set value of \$/, perl's input record separator. default is \\n.
+            one of -w -W -n is highly recommended with this option.
+  -b range  consider only a range of chars/bytes in each record (e.g. 1-3)
+  -d delim  delimiter for -f (default: /\\s+/ aka whitespace)
   -f field  consider only this (delimiter-separated) field
   -i        case insensitive (actually, lowercases all input)
-  -w        remove leading and trailing whitespace from input lines
-  -W        remove ALL whitespace from input lines
-  -e code   execute perl code for each line of input (should modify \$_,
+  -w        remove leading and trailing whitespace from input records
+  -W        remove ALL whitespace from input records
+  -n        remove all non-word (\\W) characters from input records
+  -e code   execute perl code for each input record (should modify \$_,
             make sure you quote the argument as needed by your shell)
+  -k        skip blank records
+  -F        word frequency count. alias for -ink/' '
+
+Options that don't take arguments may be bundled: -BipW is the same as
+-B -i -p -W.
 
 Input will be read from filenames given on the command line, or from
-standard input if none given. The input need not be sorted. The output
-will always be sorted.
+standard input if none given, or if the filename - (hyphen) is given (use
+./- to read file a real file named -). The input need not be sorted. The
+output will always be sorted.
 
-Each line of input is chomped (has trailing \\n removed) before any
-further processing.
+Each input record is chomped before any further processing.
 
 -b is like the -b or -c option to cut(1) (depending on whether -B is
 set). It supports the same type of range as cut(1):
 
 N      N'th byte/character, counted from 1
-N-     from N'th byte/character to end of line
+N-     from N'th byte/character to end of record
 N-M    from N'th to M'th (included) byte/character
 -M     from first to M'th (included) byte/character
 
+...plus 2 extra types:
+
+-M-    from Mth-to-last byte/character to end of record (-1 = last)
+-M-N   from Mth-to-last byte/characters to Nth-to-last
+
 ...except that cut allows many ranges separated by commas, while $SELF
 -b only allows a single range.
 
--d is like the the -d option to cut(1), except that the delimiter can be
-multiple characters. Also, the delimiter is treated as a regular expression
-if it's at least 4 characters long *and* enclosed in //.
+-d is like the the -d option to cut(1), except that the delimiter can
+be multiple characters. Also, the delimiter is treated as a regular
+expression if it's at least 3 characters long *and* enclosed in //. The
+/i modifier is supported, but none of the other /x regex modifiers are.
 
 -f like cut's -f, except that it only allows a single field number (not
 a list), which is indexed starting from 1 (same as cut)... or a negative
 number, meaning the Nth field from the right (-1 = rightmost). Also
 unlike cut, -f and -b may be combined (-b is applied first).
 
-The -b -f -i -w -W -e<code> options will be processed in the order
-listed here, regardless of the order they're given on the command
+The -b -f -i -w -W -n -e<code> -k options will be processed in the
+order listed here, regardless of the order they're given on the command
 line. In particular, this means the code for -e will see \$_ *after*
-it's been modified by any of the other options.
+it's been modified by any of the other options (except -k).
+
+The code for -e will run with strict disabled and warnings enabled. To
+disable warnings, prefix the code with 'no warnings;'. There can only
+be one -e option, but it may be multiple lines of code separated with
+semicolons (like perl's own -e option). When the -e code runs, \$_
+contains the input (possibly tranformed by other options), and can
+be modified arbitratily. The -e code can filter out unwanted records by
+executing "next", which will cause them to be skipped entirely. Also,
+if the -k option is used, the code can 'undef \\$_' or assign \\$_=""
+to skip the current record.
 
-There can only be one -e option, but it may be multiple lines of code
-separated with semicolons (like perl's own -e option). When the -e code
-runs, \$_ contains the input (possibly tranformed by other options),
-and can be modified arbitratily. The -e code can filter out unwanted
-lines by executing "next", which will cause them to be skipped entirely.
 EOF
 }
 
@@ -109,12 +142,22 @@ sub hexify {
 }
 
 # main()
-getopts('hcpiwWte:d:f:b:xo:Bs:T:P', \%opt);
+our %opt;
+getopts('hcpiwWte:d:f:b:xo:Bs:T:P/:nkF', \%opt);
 
 HELP_MESSAGE() if $opt{h};
 
-die "$SELF: options -c and -p are mutually exclusive\n"
-	if $opt{c} && $opt{p};
+# -F = -ink/' '
+if($opt{F}) {
+	$opt{'/'} = ' ';
+	$opt{n} = $opt{k} = $opt{i} = 1;
+}
+
+# use an eval here so we can handle escapes like \t \n \r
+if(defined($opt{'/'})) {
+	$opt{'/'} =~ s/'/\\'/g; # to allow -/"'"
+	eval "\$/ = '" .  $opt{'/'} . "'";
+}
 
 if(defined $opt{o}) {
 	$opt{P} = 1;
@@ -123,22 +166,29 @@ if(defined $opt{o}) {
 }
 
 if(defined $opt{d}) {
-	if($opt{d} =~ m|^/(.{2,})/$|) {
-		$opt{d} = qr/$1/;
+	if($opt{d} =~ m(^/(.+)/(i?)$)) {
+		# qr/$1/$2 is a syntax error, so:
+		$opt{d} = $2 ? qr/$1/i : qr/$1/;
 	} else {
 		$opt{d} = quotemeta($opt{d});
 		$opt{d} = qr/$opt{d}/;
 	}
+	if(not defined($opt{f})) {
+		warn "$SELF: -d given without -f, which is pointless\n";
+	}
 } else {
 	$opt{d} = qr/\s+/;
 }
 
+our $substrarg;
 if(defined $opt{b}) {
 	for($opt{b}) {
-		/^(\d+)$/ && do { $substrarg = "$1 - 1, 1" };
-		/^(\d+)-$/ && do { $substrarg = "$1 - 1" };
-		/^-(\d+)$/ && do { $substrarg = "0, $1" };
-		/^(\d+)-(\d+)$/ && do { $substrarg = "$1 - 1, " . ($2 - $1 + 1) };
+		/^(\d+)$/ && do        { $substrarg = "$1 - 1, 1" };
+		/^(\d+)-$/ && do       { $substrarg = "$1 - 1" };
+		/^-(\d+)$/ && do       { $substrarg = "0, $1" };
+		/^(\d+)-(\d+)$/ && do  { $substrarg = "$1 - 1, " . ($2 - $1 + 1) };
+		/^-(\d+)-$/ && do      { $substrarg = "$1"; };
+		/^-(\d+)-(\d+)$/ && do { $substrarg = "$1, " . ($2 - $1); };
 	}
 	die "$SELF: invalid -b argument\n" unless $substrarg;
 }
@@ -147,27 +197,20 @@ if(defined $opt{b}) {
 # don't break using -1 for rightmost field)
 $opt{f}-- if defined $opt{f} && $opt{f} > 0;
 
+our ($revsort, $foldsort, $alphasort, $sortcode);
+
 if($opt{s}) {
 	for(split "", $opt{s}) {
-		/r/ && do { $revsort++;   };
-		/f/ && do { $foldsort++;  };
-		/c/ && do { $countsort++; };
-		/([^rfc])/ && do {
+		/r/ && do { $revsort++;                 };
+		/f/ && do { $foldsort++;  $alphasort++; };
+		/a/ && do { $alphasort++;               };
+		/([^rfa])/ && do {
 			warn "$SELF: ignoring unknown sort option '$1'\n"; };
 	}
 }
 
-if($countsort && $foldsort) {
-	die "$SELF: sort options 'c' and 'f' are mutually exclusive\n";
-}
-
-if($countsort) {
-	$sortcode = "{ " .
-		'$counts{' . ($revsort ? '$b' : '$a') . '}' .
-		" <=> " .
-		'$counts{' . ($revsort ? '$a' : '$b') . '}' .
-		"}";
-} else {
+# Sorry, this is kinda ugly.
+if($alphasort) {
 	$sortcode = "{ " .
 		($foldsort ? 'lc ' : "" ) .
 		($revsort ? '$b' : '$a') .
@@ -175,18 +218,24 @@ if($countsort) {
 		($foldsort ? 'lc ' : "" ) .
 		($revsort ? '$a' : '$b') .
 		"}";
+} else {
+	$sortcode = "{ " .
+		'$counts{' . ($revsort ? '$b' : '$a') . '}' .
+		" <=> " .
+		'$counts{' . ($revsort ? '$a' : '$b') . '}' .
+		"}";
 }
 
 # finally done with option processing, let the main event commence.
 
-%counts = ();
-$total = 0;
-$longest = 0;
-$readfiles = 0;
-$badfiles = 0;
+our %counts = ();
+our $total = 0;
+our $longest = 0;
+our $readfiles = 0;
+our $badfiles = 0;
 
 # Sadly, we can't use the magical while(<>) here to automatically iterate
-# and open all the files in @ARGV, because of the locale stuff. We need to
+# and open all the files in @ARGV, because of the -B option. We need to
 # call binmode() on each filehandle after it's opened, but before anything
 # gets read from it. 'use open ":bytes"' would set the default binmode,
 # but I couldn't get it to work conditionally (not even with eval).
@@ -220,20 +269,24 @@ for(@ARGV) {
 			$_ = "" unless defined $_;
 		}
 
-		if($opt{f}) {
-			$_ = (split($opt{d}))[$opt{f}];
+		if(defined $opt{f}) {
+			$_ = (split(/$opt{d}/))[$opt{f}];
 			$_ = "" unless defined $_;
 		}
 
 		$_ = lc if $opt{i};
 		s/^\s+|\s+$//g if $opt{w};
 		s/\s//g if $opt{W};
+		s/\W//g if $opt{n};
 
 		if($opt{e}) {
+			no strict;
 			no warnings qw/exiting/; # so -e code can "next" to skip a line
 			eval $opt{e};
 			die "$SELF: $@" if $@;
 		}
+		next if $opt{k} && (!defined || length == 0);
+		$_ = "" unless defined $_;
 
 		$counts{$_}++;
 		$total++;
@@ -251,7 +304,7 @@ if($opt{T}) {
 				(!$pct && ($counts{$_} < $thresh));
 		}
 	} else {
-		warn "$SELF: invalid argument for -T\n";
+		die "$SELF: invalid argument for -T\n";
 	}
 }
 
@@ -266,9 +319,8 @@ if(!$opt{P}) {
 for(sort { eval $sortcode } keys %counts) {
 	print ($opt{x} ? hexify($_) : $_);
 	print " " x ($longest - length) unless $opt{P};
-	print $opt{o};
-	print $counts{$_} . $opt{o} unless $opt{p};
-	printf "%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
+	print $opt{o} . $counts{$_} unless $opt{p};
+	printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
 	print "\n";
 }
 
@@ -295,7 +347,7 @@ ls | bkt -e 's,^(.).*,$1,'
 # show percentages of stuff said by each user in an irssi IRC log. relies
 # on the log format having a timestamp, space, <nick> for normal lines.
 # misses /me actions entirely though.
-# add -src to show the most talkative first.
+# add -sr to show the most talkative first.
 bkt -f2 -e 'next unless /^\</' channelname.log
 
 # show us how many users use each shell (including stuff like /bin/false).
@@ -304,3 +356,6 @@ bkt -d: -f-1 /etc/passwd
 # how many images of each type have we got? ignore case, so JPG and jpg
 # are counted together.
 ls ~/images/*.* | bkt -d. -f-1
+
+# What percentage of words in a text file are capitalized?
+bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt