From 10453a597b4186b1e4ed9e85c07afa3c88c7502f Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Tue, 13 Oct 2015 18:02:56 -0400 Subject: bkt: -sn -H opts, doc fixes, sorting speedup --- bkt | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 23 deletions(-) diff --git a/bkt b/bkt index ffa31b6..eb1092c 100755 --- a/bkt +++ b/bkt @@ -121,7 +121,7 @@ Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a literal string or a regular expression (if enclosed in //, with optional /i modifier). As a special case, I alone is treated as a literal string. This option does nothing without B<-f>. B<-d>'s argument is used with -perl's B, so you might read I to understand this. +perl's B, so you might read I to understand this. =item B<-f> I @@ -179,7 +179,7 @@ Grep for records not containing regex. Equivalent to: B<-e 'next if /regex/'>. =item B<-e> I Execute perl code for each input record. The code should modify B<$_>. -Make sure you quote the argument as needed by your shell. See NOTES +Make sure you quote the argument as needed by your shell. See OPERATION below for more information. =item B<-k> @@ -224,6 +224,13 @@ was implemented for debugging purposes, but it might be useful for other stuff. When B<-C> is used, B<-a> and B<-x> still work, but none of the other output options have any effect. +=item B<-H> + +Print a crude ASCII art histogram. The histogram's width is half the +width of the terminal, as reported by the COLUMNS environment variable +(or by B, if COLUMNS not set), so the width can be controlled +by setting COLUMNS manually. + =item B<-x> Print output records as hexadecimal. @@ -237,7 +244,8 @@ ASCII output: render non-ASCII characters as hex escapes. Output sort options. Options may include: r - reverse sort (default is ascending) - a - sort alphabetically (default is by count, then alpha) + a - sort alphabetically by record (default is by count, then alpha) + n - sort numerically by record f - fold case =item B<-T> I @@ -255,7 +263,7 @@ The B<-o> option enables this as well. =back -=head1 NOTES +=head1 OPERATION Input will be read from filenames given on the command line, or from standard input if none given, or if the filename B<-> (hyphen) is @@ -264,26 +272,31 @@ not be sorted. The output will always be sorted. Each input record is chomped before any further processing. -The B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g> B<-v> B<-e>I -B<-k> options will be applied to each record in the order listed here, -regardless of the order they're given on the command line. In particular, -this means the code for B<-e> will see B<$_> *after* it's been modified -by any of the other options (except B<-k>). +The transform options (B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g> +B<-v> B<-e>I B<-k>) will be applied to each record in the order +listed here, regardless of the order they're given on the command line. In +particular, this means the code for B<-e> will see B<$_> *after* it's +been modified by any of the other options (except B<-k>). In the case +of B<-l>, B<-g>, B<-v>, the B<-e> code will not be run for records not +matched by the options. The code for B<-e> will run with strict disabled and warnings enabled. To disable warnings, prefix the code with 'no warnings;'. There can only be one B<-e> option, but it may be multiple lines of code separated with -semicolons (like perl's own B<-e> option). When the B<-e> code runs, B<$_> -contains the input (possibly tranformed by other options), and can -be modified arbitratily. The B<-e> code can filter out unwanted records by -executing "next", which will cause them to be skipped entirely. Also, -if the B<-k> option is used, the code can B or assign B<$_=""> -to skip the current record. +semicolons. When the B<-e> code runs, B<$_> contains the input (possibly +tranformed by other options), and can be modified arbitratily. The B<-e> +code can filter out unwanted records by executing "next", which will +cause them to be skipped entirely. Also, if the B<-k> option is used, +the code can B or assign B<$_=""> to skip the current record. The astute reader will have noticed that all the other transform options could be written as code for B<-e>. This is correct: the other options exist to support lazy typists such as the author. +After each record of input is read and any transform options applied +to it, I<$counts{$_}++> is executed. After all input is read, output is +generated via I. + =head1 EXIT STATUS B exits with zero (success) status if all operations were successful, @@ -359,6 +372,14 @@ but it's still a lot more keystrokes. -- +Plot a histogram of word lengths in a file of words: + + bkt -H -sn -e '$_=length' /usr/share/dict/words + +...which should show a nice bell-curve distrubution. + +-- + What percentage of words in a text file are capitalized? bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt @@ -442,6 +463,16 @@ sub hexify { return join(" ", @out); } +sub print_histogram { + my $value = shift; + my $maxval = shift; + my $columns = shift; + + my $hist = " " x $columns; + substr($hist, int($value / $maxval * ($columns - 1)), 1) = "*"; + print $hist; +} + # this bit of 300 baud linenoise replaces all non-ascii characters # with {\x00} style hex escapes. best viewed with a colorful syntax # highlighting editor (I like vim). @@ -472,7 +503,7 @@ sub render { } # main() -getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:C', \our %opt); +getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:CH', \our %opt); # -h == --help HELP_MESSAGE(), exit(0) if $opt{h}; @@ -566,17 +597,21 @@ if(defined $opt{l}) { $opt{f}-- if defined $opt{f} && $opt{f} > 0; # handle the various -s sub-options -our ($revsort, $foldsort, $alphasort); +our ($revsort, $foldsort, $alphasort, $numsort); if($opt{s}) { for(split "", $opt{s}) { /r/ && do { $revsort++; }; /f/ && do { $foldsort++; }; /a/ && do { $alphasort++; }; - /([^rfa])/ && do { + /n/ && do { $numsort++; }; + /([^rfan])/ && do { warn "$SELF: ignoring unknown sort option '$1'\n"; }; } } +warn "$SELF: sort opts a and n conflict, ignoring a\n" if $alphasort && $numsort; +warn "$SELF: sort opts f and n conflict, ignoring f\n" if $foldsort && $numsort; + # construct a string of perl code to implement the sort, according # to the options given. sorry, this is kinda ugly. our ($a, $b, $A, $B); @@ -585,7 +620,9 @@ $b = $revsort ? '$a' : '$b'; ($A, $B) = $foldsort ? ("lc $a", "lc $b") : ($a, $b); our $sortcode = $A . " cmp " . $B; -if($alphasort) { +if($numsort) { + $sortcode = "{ $A <=> $B }" +} elsif($alphasort) { $sortcode = "{ $sortcode }"; } else { $sortcode = "{ (\$counts{$a} <=> \$counts{$b}) || ($sortcode) }"; @@ -651,7 +688,6 @@ for(@ARGV) { my $in = $_; no warnings qw/substr/; eval "\$out .= substr(\$in, $_)" for(@substrargs); - die $@ if $@; $_ = $out; $_ = "" unless defined $_; } @@ -682,6 +718,7 @@ for(@ARGV) { die "$SELF: couldn't read any input files\n" unless $readfiles; +# done reading & counting all input, show the results. if(!$opt{C}) { if($opt{T}) { (my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/); @@ -696,15 +733,26 @@ if(!$opt{C}) { } } - if(!$opt{P}) { + my $maxval = 0; + if($opt{H} || !$opt{P}) { for(keys %counts) { my $l = length(render($_)); $longest = $l if $longest < $l; + $maxval = $counts{$_} if $counts{$_} > $maxval; } } -# done reading & counting all input, show the results. - for(sort { eval $sortcode } keys %counts) { + my $histwidth; # TODO: parameterize + if($opt{H}) { + chomp($histwidth = int(($ENV{COLUMNS} || `tput cols 2>/dev/null` || 80) / 2)); + } + + # do this instead of using sort { eval $sortcode } in the loop, + # since eval is slow. + my $sortsub = eval "sub " . $sortcode; + + for(sort { $sortsub } keys %counts) { + (print_histogram($counts{$_}, $maxval, $histwidth), print $opt{o}) if $opt{H}; print (my $printable = render($_)); print " " x ($longest - length($printable)) unless $opt{P}; print $opt{o} . $counts{$_} unless $opt{p}; -- cgit v1.2.3