1 files changed, 309 insertions, 137 deletions
diff --git a/bkt b/bkt
index 8eb1f86..0fee2f4 100755
--- a/bkt
+++ b/bkt
@@ -1,138 +1,338 @@
 #!/usr/bin/perl
 
-# by popular demand:
-use warnings;
-use strict;
+=pod
 
-# I wish there were a way to do this conditionally.
-# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std');
-use open ":locale", ":std";
+=head1 NAME
 
-use Getopt::Std;
-# this makes getopts exit after --help:
-$Getopt::Std::STANDARD_HELP_VERSION++;
+B<bkt> - count repeats in input
 
-(our $SELF = $0) =~ s,.*/,,;
-our $VERSION="0.0.1";
+=head1 SYNOPSIS
 
-sub HELP_MESSAGE {
-	print <<EOF;
-$SELF - count repeats in input
+B<bkt> -h | --help
+
+B<bkt> -[cpiwWtxaBPnkFL] [...] [-e code] [-d delim] [-f field] [-b list]
+[-o delim] [-s sortopts] [-T thresh[%]] [-/ separator] [-r recordsize]
+<file> <file ...>
 
-Usage: $SELF <options> <file> ...
+=head1 DESCRIPTION
+
+B<bkt> reads input from files or standard input, optionally transforms
+it according to various options, and counts like inputs. After all input
+is read, a count is given for the occurrence of each input.
 
 Given the following input:
 
-foo
-foo
-bar
-bar
-baz
-
-$SELF will output:
-
-bar	2	40.0%
-baz	1	20.0%
-foo	2	40.0%
-
-The name 'bkt' comes from the concept of collecting like items in
-buckets. The original plan was to name this script 'bucketize', but
-who wants to type all that? Also, purely to support lazy typists, $SELF
-implements subsets of the functionality of cut(1) and sort(1).
-
-General options:
-  --help
-  -h        display this help message
-  --version display '$SELF $VERSION'
-  --        end of options; everything after this is treated as a filename
-
-Output options:
-  -c        show counts only (suppress percentages)
-  -p        show percentages only (suppress counts)
-            -c and -p may be combined, if you can find a use for it
-  -t        show total count
-  -x        print output in hexadecimal
-  -a        ASCII output: render non-ASCII characters as hex escapes
-  -s opts   output sort options. opts may include:
-            r - reverse sort (default is ascending)
-            a - sort alphabetically (default is by count)
-            f - sort alphabetically, folding case
-  -T thresh filter out results below threshold (which may be a
-            count or a percentage, e.g. 5%).
-  -o string use string as output delimiter (default: \\t). implies -P.
-  -P        don't pad output with spaces to length of longest element
-            -o option enables this as well.
-
-Input options:
-  -B        binary mode (default: input is characters in current locale)
-  -r int    read input as fixed-size records (can't combine with -/)
-  -/ sep    set value of \$/, perl's input record separator. default is \\n.
-            one of -w -W -n is highly recommended with this option.
-  -b range  consider only a range of chars/bytes in each record (e.g. 1-3)
-  -d delim  delimiter for -f (default: /\\s+/ aka whitespace)
-  -f field  consider only this (delimiter-separated) field
-  -i        case insensitive (actually, lowercases all input)
-  -w        remove leading and trailing whitespace from input records
-  -W        remove ALL whitespace from input records
-  -n        remove all non-word (\\W) characters from input records
-  -e code   execute perl code for each input record (should modify \$_,
-            make sure you quote the argument as needed by your shell)
-  -k        skip blank records
-  -F        word frequency count. alias for -ink/' '
-  -L        letter frequency count. alias for -inkr1
-
-Options that don't take arguments may be bundled: -BipW is the same as
--B -i -p -W.
-
-Input will be read from filenames given on the command line, or from
-standard input if none given, or if the filename - (hyphen) is given (use
-./- to read file a real file named -). The input need not be sorted. The
-output will always be sorted.
+	foo
+	foo
+	bar
+	bar
+	baz
+
+B<bkt> will output:
+
+	bar	2	40.0%
+	baz	1	20.0%
+	foo	2	40.0%
+
+The name 'B<bkt>' comes from the concept of collecting like items in
+buckets. The original plan was to name this script 'bucketize', but who
+wants to type all that? Also, purely to support lazy typists, B<bkt>
+implements subsets of the functionality of B<cut>(1) and B<sort>(1).
+
+The utility of B<bkt> will be obvious, if you've written lots of
+variants of:
+
+	<shell commands> | perl -lne 's/\s.*//; $a{$_}++; END { print "$_ $a{$_}" for sort keys %a }'
+
+=head1 OPTIONS
+
+=head2 General options
+
+=over
+
+=item B<--help>, B<-h>
+
+Display this help message
+
+=item B<--version>
+
+Display version of bkt
+
+=item B<-->
+
+End of options. Everything after this is treated as a filename.
+
+=back
+
+=head2 Output options
+
+=over
+
+=item B<-c>
+
+Show counts only (suppress percentages).
+
+=item B<-p>
+
+Show percentages only (suppress counts).
+-c and -p may be combined, if you can find a use for it.
+
+=item B<-t>
+
+Show total count after all item counts.
+
+=item B<-x>
+
+Print output in hexadecimal.
+
+=item B<-a>
+
+ASCII output: render non-ASCII characters as hex escapes.
+
+=item B<-s> I<opts>
+
+Output sort options. Options may include:
+
+	r - reverse sort (default is ascending)
+	a - sort alphabetically (default is by count, then alpha)
+	f - fold case
+
+=item B<-T> I<thresh[%]>
+
+Filter out results below threshold, which may be a count or a percentage, e.g. 5%.
+
+=item B<-o> I<string>
+
+Use string as output delimiter (default: \t). Implies -P.
+
+=item B<-P>
+
+Don't pad output with spaces to length of longest element. The -o option enables this as well.
+
+=back
+
+=head2 Input options
+
+=over
+
+=item B<-B>
+
+Byte mode. By default, input is treated as characters in the encoding
+specified by the current locale. B<-B> treats the input as a stream of
+8-bit bytes (octets, if you like).
+
+=item B<-r> I<recordsize>
+
+Read input as fixed-size records. This can't be combined with B<-/>.
+
+=item B<-/> I<sep>
+
+Set value of B<$/>, perl's input record separator. Default is I<\n>.
+One of B<-w>, B<-W> or B<-n> is highly recommended with this option. This
+can't be combined with B<-r>.
+
+=back
+
+=head2 Transform options
+
+=over
+
+=item B<-d> I<delim>
+
+Delimiter for B<-f>. Default: /\s+/ aka whitespace. This can be a fixed
+string or a regular expression (if enclosed in //, with optional /i
+modifier). This option does nothing without B<-f>.
+
+=item B<-f> I<field>
+
+Consider only this (B<-d> delimiter separated) field.
+
+=item B<-b> I<range>
+
+Consider only a range of characters (or bytes, if B<-B>) in each record.
+Example: I<1-3> for the first 3 bytes/chars of each input record.
+
+=item B<-i>
+
+Case insensitive mode. Actually, lowercases all input. Use B<-sf>
+instead to sort output case-insensitively.
+
+=item B<-w>
+
+Remove leading and trailing whitespace from input records.
+
+=item B<-W>
+
+Remove ALL whitespace from input records.
+
+=item B<-n>
+
+Remove all non-word (I<\W>) characters from input records.
+
+=item B<-e> I<code>
+
+Execute perl code for each input record. The code should modify B<$_>.
+Make sure you quote the argument as needed by your shell.
+
+=item B<-k>
+
+Skip blank records. Basically the same as B<-e 'next if $_ eq ""'>.
+
+=item B<-F>
+
+Word frequency count. Alias for B<-ink/' '>.
+
+=item B<-L>
+
+Letter frequency count. Alias for B<-inkr1>.
+
+=back
+
+Options that don't take arguments may be bundled: B<-BipW> is the same as
+B<-B> B<-i> B<-p> B<-W>.
+
+Input will be read from filenames given on the command line, or
+from standard input if none given, or if the filename B<-> (hyphen) is
+given. Use B<./-> to read a real file named B<->. The input need
+not be sorted. The output will always be sorted.
 
 Each input record is chomped before any further processing.
 
--b is like the -b or -c option to cut(1) (depending on whether -B is
-set). It supports the same type of range as cut(1):
+B<-b> is like the B<-b> or B<-c> option to cut(1) (depending on whether B<-B> is
+set). It supports the same types of range as cut(1):
 
-N      N'th byte/character, counted from 1
-N-     from N'th byte/character to end of record
-N-M    from N'th to M'th (included) byte/character
--M     from first to M'th (included) byte/character
+	N      N'th byte/character, counted from 1
+	N-     from N'th byte/character to end of record
+	N-M    from N'th to M'th (included) byte/character
+	-M     from first to M'th (included) byte/character
 
 ...plus 2 extra types:
 
--M-    from Mth-to-last byte/character to end of record (-1 = last)
--M-N   from Mth-to-last byte/characters to Nth-to-last
+	-M-    from Mth-to-last byte/character to end of record (-1 = last)
+	-M-N   from Mth-to-last byte/characters to Nth-to-last
 
-...except that cut allows many ranges separated by commas, while $SELF
--b only allows a single range.
+...except that cut allows many ranges separated by commas, while B<bkt>
+B<-b> only allows a single range.
 
--d is like the the -d option to cut(1), except that the delimiter can
+B<-d> is like the B<-d> option to cut(1), except that the delimiter can
 be multiple characters. Also, the delimiter is treated as a regular
-expression if it's at least 3 characters long *and* enclosed in //. The
+expression if it's at least 3 characters long *and* enclosed in I<//>. The
 /i modifier is supported, but none of the other /x regex modifiers are.
 
--f like cut's -f, except that it only allows a single field number (not
+B<-f> is like cut's B<-f>, except that it only allows a single field number (not
 a list), which is indexed starting from 1 (same as cut)... or a negative
 number, meaning the Nth field from the right (-1 = rightmost). Also
-unlike cut, -f and -b may be combined (-b is applied first).
+unlike cut, B<-f> and B<-b> may be combined (B<-f> is applied first).
 
-The -b -f -i -w -W -n -e<code> -k options will be processed in the
-order listed here, regardless of the order they're given on the command
-line. In particular, this means the code for -e will see \$_ *after*
-it's been modified by any of the other options (except -k).
+The B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-e>I<code> B<-k> options will
+be processed in the order listed here, regardless of the order they're
+given on the command line. In particular, this means the code for B<-e>
+will see B<$_> *after* it's been modified by any of the other options
+(except B<-k>).
 
-The code for -e will run with strict disabled and warnings enabled. To
+The code for B<-e> will run with strict disabled and warnings enabled. To
 disable warnings, prefix the code with 'no warnings;'. There can only
-be one -e option, but it may be multiple lines of code separated with
-semicolons (like perl's own -e option). When the -e code runs, \$_
+be one B<-e> option, but it may be multiple lines of code separated with
+semicolons (like perl's own B<-e> option). When the B<-e> code runs, B<$_>
 contains the input (possibly tranformed by other options), and can
-be modified arbitratily. The -e code can filter out unwanted records by
+be modified arbitrarily. The B<-e> code can filter out unwanted records by
 executing "next", which will cause them to be skipped entirely. Also,
-if the -k option is used, the code can 'undef \\$_' or assign \\$_=""
+if the B<-k> option is used, the code can B<undef $_> or assign B<$_="">
 to skip the current record.
 
-EOF
+The astute reader will have noticed that all the other transform options
+could be written as code for B<-e>. This is correct: the other options
+exist to support lazy typists such as the author.
+
+=head1 EXAMPLES
+
+Show the percentage of binaries that start with each letter/number/etc,
+4 different ways.
+
+	cd /usr/bin
+	ls | bkt -b1
+	ls | cut -b1 | bkt
+	ls | bkt -e '$_=substr($_,0,1)'
+	ls | bkt -e 's,^(.).*,$1,'
+
+Show percentages of lines said by each user in an irssi IRC log. Relies
+on the log format having a timestamp, space, <nick> for normal lines.
+Misses /me actions entirely though. Add -sr to show the most talkative
+first.
+
+	bkt -f2 -e 'next unless /^\</' channelname.log
+
+Show us how many users use each shell (including stuff like /bin/false).
+
+	bkt -d: -f-1 /etc/passwd
+
+How many images of each type have we got? Ignore case, so JPG and jpg
+are counted together.
+
+	ls ~/images/*.* | bkt -i -d. -f-1
+
+What percentage of words in a text file are capitalized?
+
+	bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt
+
+Given a CSV file with fields lastname, firstname, phonenumber:
+
+	Blow,Joe,444-555-0123
+	Showers,April,876-333-9874
+   ...etc...
+
+...to get a breakdown by area code:
+
+   bkt -d, -f3 -b1-3
+
+Suppose you have a team of people working on a large C++ or Java codebase.
+By convention, TODO comments are written as:
+
+  // TODO bob: Support non-Unicode locales
+
+...where "bob" is the coder assigned to that TODO item. You can get a
+summary of these with:
+
+  find . -name '*.c++' | xargs bkt -e 'm,//\s+TODO\s+(\w+),||next; $_=$1;'
+
+...which might show something like:
+
+   john   3    3.7%
+   bill   18   22.2%
+   jane   23   28.4%
+   bob    37   45.7%
+
+=head1 AUTHOR
+
+B. Watson <yalhcru@gmail.com>
+
+=head1 LICENSE
+
+WTFPL. See http://www.wtfpl.net/txt/copying/ for full text of license.
+
+=head1 SEE ALSO
+
+B<cut>(1), B<sort>(1), B<perl>(1)
+
+=cut
+
+# by popular demand:
+use warnings;
+use strict;
+
+# I wish there were a way to do this conditionally.
+# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std');
+use open ":locale", ":std";
+
+use Getopt::Std;
+# this makes getopts exit after --help:
+$Getopt::Std::STANDARD_HELP_VERSION++;
+
+(our $SELF = $0) =~ s,.*/,,;
+our $VERSION="0.0.1";
+
+sub HELP_MESSAGE { # --help hook invoked by Getopt::Std: replace this process with perldoc showing the script's own POD
+	exec('perldoc', $0) or die "$SELF: can't run perldoc: $!\n"; # list form bypasses the shell, so spaces/metacharacters in $0 are safe; exec only returns on failure
 }
 
 sub VERSION_MESSAGE {
@@ -327,14 +527,14 @@ for(@ARGV) {
 		# behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist,
 		# or -b10 but only 9 characters exist.
 
-		if($substrarg) { # set via $opt{b}
-			no warnings qw/substr/;
-			eval "\$_ = substr(\$_, $substrarg)";
+		if(defined $opt{f}) {
+			$_ = (split(/$opt{d}/))[$opt{f}];
 			$_ = "" unless defined $_;
 		}
 
-		if(defined $opt{f}) {
-			$_ = (split(/$opt{d}/))[$opt{f}];
+		if($substrarg) { # set via $opt{b}
+			no warnings qw/substr/;
+			eval "\$_ = substr(\$_, $substrarg)";
 			$_ = "" unless defined $_;
 		}
 
@@ -395,31 +595,3 @@ if($opt{t}) {
 # be like cat, exit with error status if any input file couldn't be
 # read (even if we did successfully read others)
 exit($badfiles != 0);
-
-__END__
-
-Examples:
-
-# show the percentage of binaries that start with each letter/number/etc,
-# 4 different ways
-cd /usr/bin
-ls | bkt -b1
-ls | cut -b1 | bkt
-ls | bkt -e '$_=substr($_,0,1)'
-ls | bkt -e 's,^(.).*,$1,'
-
-# show percentages of stuff said by each user in an irssi IRC log. relies
-# on the log format having a timestamp, space, <nick> for normal lines.
-# misses /me actions entirely though.
-# add -sr to show the most talkative first.
-bkt -f2 -e 'next unless /^\</' channelname.log
-
-# show us how many users use each shell (including stuff like /bin/false).
-bkt -d: -f-1 /etc/passwd
-
-# how many images of each type have we got? ignore case, so JPG and jpg
-# are counted together.
-ls ~/images/*.* | bkt -d. -f-1
-
-# What percentage of words in a text file are capitalized?
-bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt