From 10453a597b4186b1e4ed9e85c07afa3c88c7502f Mon Sep 17 00:00:00 2001
From: "B. Watson" <yalhcru@gmail.com>
Date: Tue, 13 Oct 2015 18:02:56 -0400
Subject: bkt: -sn -H opts, doc fixes, sorting speedup

---
 bkt | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 71 insertions(+), 23 deletions(-)
diff --git a/bkt b/bkt
index ffa31b6..eb1092c 100755
--- a/bkt
+++ b/bkt
@@ -121,7 +121,7 @@ Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a literal
 string or a regular expression (if enclosed in //, with optional /i
 modifier). As a special case, I<//> alone is treated as a literal string.
 This option does nothing without B<-f>. B<-d>'s argument is used with
-perl's B<split>, so you might read I<perdoc -f split> to understand this.
+perl's B<split>, so you might read I<perldoc -f split> to understand this.
 
 =item B<-f> I<field>
 
@@ -179,7 +179,7 @@ Grep for records not containing regex. Equivalent to: B<-e 'next if /regex/'>.
 =item B<-e> I<code>
 
 Execute perl code for each input record. The code should modify B<$_>.
-Make sure you quote the argument as needed by your shell. See NOTES
+Make sure you quote the argument as needed by your shell. See OPERATION
 below for more information.
 
 =item B<-k>
@@ -224,6 +224,13 @@ was implemented for debugging purposes, but it might be useful for other
 stuff. When B<-C> is used, B<-a> and B<-x> still work, but none of the
 other output options have any effect.
 
+=item B<-H>
+
+Print a crude ASCII art histogram. The histogram's width is half the
+width of the terminal, as reported by the COLUMNS environment variable
+(or by B<tput cols>, if COLUMNS not set), so the width can be controlled
+by setting COLUMNS manually.
+
 =item B<-x>
 
 Print output records as hexadecimal.
@@ -237,7 +244,8 @@ ASCII output: render non-ASCII characters as hex escapes.
 Output sort options. Options may include:
 
 	r - reverse sort (default is ascending)
-	a - sort alphabetically (default is by count, then alpha)
+	a - sort alphabetically by record (default is by count, then alpha)
+	n - sort numerically by record
 	f - fold case
 
 =item B<-T> I<thresh[%]>
@@ -255,7 +263,7 @@ The B<-o> option enables this as well.
 
 =back
 
-=head1 NOTES
+=head1 OPERATION
 
 Input will be read from filenames given on the command line, or
 from standard input if none given, or if the filename B<-> (hyphen) is
@@ -264,26 +272,31 @@ not be sorted. The output will always be sorted.
 
 Each input record is chomped before any further processing.
 
-The B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g> B<-v> B<-e>I<code>
-B<-k> options will be applied to each record in the order listed here,
-regardless of the order they're given on the command line. In particular,
-this means the code for B<-e> will see B<$_> *after* it's been modified
-by any of the other options (except B<-k>).
+The transform options (B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g>
+B<-v> B<-e>I<code> B<-k>) will be applied to each record in the order
+listed here, regardless of the order they're given on the command line. In
+particular, this means the code for B<-e> will see B<$_> *after* it's
+been modified by any of the other options (except B<-k>). In the case
+of B<-l>, B<-g>, B<-v>, the B<-e> code will not be run for records not
+matched by the options.
 
 The code for B<-e> will run with strict disabled and warnings enabled. To
 disable warnings, prefix the code with 'no warnings;'. There can only
 be one B<-e> option, but it may be multiple lines of code separated with
-semicolons (like perl's own B<-e> option). When the B<-e> code runs, B<$_>
-contains the input (possibly tranformed by other options), and can
-be modified arbitratily. The B<-e> code can filter out unwanted records by
-executing "next", which will cause them to be skipped entirely. Also,
-if the B<-k> option is used, the code can B<undef $_> or assign B<$_="">
-to skip the current record.
+semicolons. When the B<-e> code runs, B<$_> contains the input (possibly
+tranformed by other options), and can be modified arbitratily. The B<-e>
+code can filter out unwanted records by executing "next", which will
+cause them to be skipped entirely. Also, if the B<-k> option is used,
+the code can B<undef $_> or assign B<$_=""> to skip the current record.
 
 The astute reader will have noticed that all the other transform options
 could be written as code for B<-e>. This is correct: the other options
 exist to support lazy typists such as the author.
 
+After each record of input is read and any transform options applied
+to it, I<$counts{$_}++> is executed. After all input is read, output is
+generated via I<keys %counts>.
+
 =head1 EXIT STATUS
 
 B<bkt> exits with zero (success) status if all operations were successful,
@@ -359,6 +372,14 @@ but it's still a lot more keystrokes.
 
 --
 
+Plot a histogram of word lengths in a file of words:
+
+	bkt -H -sn -e '$_=length' /usr/share/dict/words
+
+...which should show a nice bell-curve distrubution.
+
+--
+
 What percentage of words in a text file are capitalized?
 
 	bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt
@@ -442,6 +463,16 @@ sub hexify {
 	return join(" ", @out);
 }
 
+sub print_histogram {
+	my $value = shift;
+	my $maxval = shift;
+	my $columns = shift;
+
+	my $hist = " " x $columns;
+	substr($hist, int($value / $maxval * ($columns - 1)), 1) = "*";
+	print $hist;
+}
+
 # this bit of 300 baud linenoise replaces all non-ascii characters
 # with {\x00} style hex escapes. best viewed with a colorful syntax
 # highlighting editor (I like vim).
@@ -472,7 +503,7 @@ sub render {
 }
 
 # main()
-getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:C', \our %opt);
+getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:CH', \our %opt);
 
 # -h == --help
 HELP_MESSAGE(), exit(0) if $opt{h};
@@ -566,17 +597,21 @@ if(defined $opt{l}) {
 $opt{f}-- if defined $opt{f} && $opt{f} > 0;
 
 # handle the various -s sub-options
-our ($revsort, $foldsort, $alphasort);
+our ($revsort, $foldsort, $alphasort, $numsort);
 if($opt{s}) {
 	for(split "", $opt{s}) {
 		/r/ && do { $revsort++;   };
 		/f/ && do { $foldsort++;  };
 		/a/ && do { $alphasort++; };
-		/([^rfa])/ && do {
+		/n/ && do { $numsort++;   };
+		/([^rfan])/ && do {
 			warn "$SELF: ignoring unknown sort option '$1'\n"; };
 	}
 }
 
+warn "$SELF: sort opts a and n conflict, ignoring a\n" if $alphasort && $numsort;
+warn "$SELF: sort opts f and n conflict, ignoring f\n" if $foldsort && $numsort;
+
 # construct a string of perl code to implement the sort, according
 # to the options given. sorry, this is kinda ugly.
 our ($a, $b, $A, $B);
@@ -585,7 +620,9 @@ $b = $revsort ? '$a' : '$b';
 ($A, $B) = $foldsort ? ("lc $a", "lc $b") : ($a,  $b);
 
 our $sortcode = $A .  " cmp " .  $B;
-if($alphasort) {
+if($numsort) {
+	$sortcode = "{ $A <=> $B }"
+} elsif($alphasort) {
 	$sortcode = "{ $sortcode }";
 } else {
 	$sortcode = "{ (\$counts{$a} <=> \$counts{$b}) || ($sortcode) }";
@@ -651,7 +688,6 @@ for(@ARGV) {
 			my $in = $_;
 			no warnings qw/substr/;
 			eval "\$out .= substr(\$in, $_)" for(@substrargs);
-			die $@ if $@;
 			$_ = $out;
 			$_ = "" unless defined $_;
 		}
@@ -682,6 +718,7 @@ for(@ARGV) {
 
 die "$SELF: couldn't read any input files\n" unless $readfiles;
 
+# done reading & counting all input, show the results.
 if(!$opt{C}) {
 	if($opt{T}) {
 		(my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
@@ -696,15 +733,26 @@ if(!$opt{C}) {
 		}
 	}
 
-	if(!$opt{P}) {
+	my $maxval = 0;
+	if($opt{H} || !$opt{P}) {
 		for(keys %counts) {
 			my $l = length(render($_));
 			$longest = $l if $longest < $l;
+			$maxval = $counts{$_} if $counts{$_} > $maxval;
 		}
 	}
 
-# done reading & counting all input, show the results.
-	for(sort { eval $sortcode } keys %counts) {
+	my $histwidth; # TODO: parameterize
+	if($opt{H}) {
+		chomp($histwidth = int(($ENV{COLUMNS} || `tput cols 2>/dev/null` || 80) / 2));
+	}
+
+	# do this instead of using sort { eval $sortcode } in the loop,
+	# since eval is slow.
+	my $sortsub = eval "sub " . $sortcode;
+
+	for(sort { $sortsub } keys %counts) {
+		(print_histogram($counts{$_}, $maxval, $histwidth), print $opt{o}) if $opt{H};
 		print (my $printable = render($_));
 		print " " x ($longest - length($printable)) unless $opt{P};
 		print $opt{o} . $counts{$_} unless $opt{p};
-- 
cgit v1.2.3