aboutsummaryrefslogtreecommitdiff
path: root/bkt
diff options
context:
space:
mode:
authorB. Watson <yalhcru@gmail.com>2015-10-13 18:02:56 -0400
committerB. Watson <yalhcru@gmail.com>2015-10-13 18:02:56 -0400
commit10453a597b4186b1e4ed9e85c07afa3c88c7502f (patch)
treeec0b5f701f02045f65419b7ba3c1a3d2f819703d /bkt
parentb182e3f487614d87f6b79ee7d182c6f504d877af (diff)
downloadmisc-scripts-10453a597b4186b1e4ed9e85c07afa3c88c7502f.tar.gz
bkt: -sn -H opts, doc fixes, sorting speedup
Diffstat (limited to 'bkt')
-rwxr-xr-xbkt94
1 files changed, 71 insertions, 23 deletions
diff --git a/bkt b/bkt
index ffa31b6..eb1092c 100755
--- a/bkt
+++ b/bkt
@@ -121,7 +121,7 @@ Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a literal
string or a regular expression (if enclosed in //, with optional /i
modifier). As a special case, I<//> alone is treated as a literal string.
This option does nothing without B<-f>. B<-d>'s argument is used with
-perl's B<split>, so you might read I<perdoc -f split> to understand this.
+perl's B<split>, so you might read I<perldoc -f split> to understand this.
=item B<-f> I<field>
@@ -179,7 +179,7 @@ Grep for records not containing regex. Equivalent to: B<-e 'next if /regex/'>.
=item B<-e> I<code>
Execute perl code for each input record. The code should modify B<$_>.
-Make sure you quote the argument as needed by your shell. See NOTES
+Make sure you quote the argument as needed by your shell. See OPERATION
below for more information.
=item B<-k>
@@ -224,6 +224,13 @@ was implemented for debugging purposes, but it might be useful for other
stuff. When B<-C> is used, B<-a> and B<-x> still work, but none of the
other output options have any effect.
+=item B<-H>
+
+Print a crude ASCII art histogram. The histogram's width is half the
+width of the terminal, as reported by the COLUMNS environment variable
+(or by B<tput cols>, if COLUMNS not set), so the width can be controlled
+by setting COLUMNS manually.
+
=item B<-x>
Print output records as hexadecimal.
@@ -237,7 +244,8 @@ ASCII output: render non-ASCII characters as hex escapes.
Output sort options. Options may include:
r - reverse sort (default is ascending)
- a - sort alphabetically (default is by count, then alpha)
+ a - sort alphabetically by record (default is by count, then alpha)
+ n - sort numerically by record
f - fold case
=item B<-T> I<thresh[%]>
@@ -255,7 +263,7 @@ The B<-o> option enables this as well.
=back
-=head1 NOTES
+=head1 OPERATION
Input will be read from filenames given on the command line, or
from standard input if none given, or if the filename B<-> (hyphen) is
@@ -264,26 +272,31 @@ not be sorted. The output will always be sorted.
Each input record is chomped before any further processing.
-The B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g> B<-v> B<-e>I<code>
-B<-k> options will be applied to each record in the order listed here,
-regardless of the order they're given on the command line. In particular,
-this means the code for B<-e> will see B<$_> *after* it's been modified
-by any of the other options (except B<-k>).
+The transform options (B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g>
+B<-v> B<-e>I<code> B<-k>) will be applied to each record in the order
+listed here, regardless of the order they're given on the command line. In
+particular, this means the code for B<-e> will see B<$_> *after* it's
+been modified by any of the other options (except B<-k>). In the case
+of B<-l>, B<-g>, B<-v>, the B<-e> code will not be run for records not
+matched by the options.
The code for B<-e> will run with strict disabled and warnings enabled. To
disable warnings, prefix the code with 'no warnings;'. There can only
be one B<-e> option, but it may be multiple lines of code separated with
-semicolons (like perl's own B<-e> option). When the B<-e> code runs, B<$_>
-contains the input (possibly tranformed by other options), and can
-be modified arbitratily. The B<-e> code can filter out unwanted records by
-executing "next", which will cause them to be skipped entirely. Also,
-if the B<-k> option is used, the code can B<undef $_> or assign B<$_="">
-to skip the current record.
+semicolons. When the B<-e> code runs, B<$_> contains the input (possibly
+tranformed by other options), and can be modified arbitratily. The B<-e>
+code can filter out unwanted records by executing "next", which will
+cause them to be skipped entirely. Also, if the B<-k> option is used,
+the code can B<undef $_> or assign B<$_=""> to skip the current record.
The astute reader will have noticed that all the other transform options
could be written as code for B<-e>. This is correct: the other options
exist to support lazy typists such as the author.
+After each record of input is read and any transform options applied
+to it, I<$counts{$_}++> is executed. After all input is read, output is
+generated via I<keys %counts>.
+
=head1 EXIT STATUS
B<bkt> exits with zero (success) status if all operations were successful,
@@ -359,6 +372,14 @@ but it's still a lot more keystrokes.
--
+Plot a histogram of word lengths in a file of words:
+
+ bkt -H -sn -e '$_=length' /usr/share/dict/words
+
+...which should show a nice bell-curve distrubution.
+
+--
+
What percentage of words in a text file are capitalized?
bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt
@@ -442,6 +463,16 @@ sub hexify {
return join(" ", @out);
}
+sub print_histogram {
+ my $value = shift;
+ my $maxval = shift;
+ my $columns = shift;
+
+ my $hist = " " x $columns;
+ substr($hist, int($value / $maxval * ($columns - 1)), 1) = "*";
+ print $hist;
+}
+
# this bit of 300 baud linenoise replaces all non-ascii characters
# with {\x00} style hex escapes. best viewed with a colorful syntax
# highlighting editor (I like vim).
@@ -472,7 +503,7 @@ sub render {
}
# main()
-getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:C', \our %opt);
+getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:CH', \our %opt);
# -h == --help
HELP_MESSAGE(), exit(0) if $opt{h};
@@ -566,17 +597,21 @@ if(defined $opt{l}) {
$opt{f}-- if defined $opt{f} && $opt{f} > 0;
# handle the various -s sub-options
-our ($revsort, $foldsort, $alphasort);
+our ($revsort, $foldsort, $alphasort, $numsort);
if($opt{s}) {
for(split "", $opt{s}) {
/r/ && do { $revsort++; };
/f/ && do { $foldsort++; };
/a/ && do { $alphasort++; };
- /([^rfa])/ && do {
+ /n/ && do { $numsort++; };
+ /([^rfan])/ && do {
warn "$SELF: ignoring unknown sort option '$1'\n"; };
}
}
+warn "$SELF: sort opts a and n conflict, ignoring a\n" if $alphasort && $numsort;
+warn "$SELF: sort opts f and n conflict, ignoring f\n" if $foldsort && $numsort;
+
# construct a string of perl code to implement the sort, according
# to the options given. sorry, this is kinda ugly.
our ($a, $b, $A, $B);
@@ -585,7 +620,9 @@ $b = $revsort ? '$a' : '$b';
($A, $B) = $foldsort ? ("lc $a", "lc $b") : ($a, $b);
our $sortcode = $A . " cmp " . $B;
-if($alphasort) {
+if($numsort) {
+ $sortcode = "{ $A <=> $B }"
+} elsif($alphasort) {
$sortcode = "{ $sortcode }";
} else {
$sortcode = "{ (\$counts{$a} <=> \$counts{$b}) || ($sortcode) }";
@@ -651,7 +688,6 @@ for(@ARGV) {
my $in = $_;
no warnings qw/substr/;
eval "\$out .= substr(\$in, $_)" for(@substrargs);
- die $@ if $@;
$_ = $out;
$_ = "" unless defined $_;
}
@@ -682,6 +718,7 @@ for(@ARGV) {
die "$SELF: couldn't read any input files\n" unless $readfiles;
+# done reading & counting all input, show the results.
if(!$opt{C}) {
if($opt{T}) {
(my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
@@ -696,15 +733,26 @@ if(!$opt{C}) {
}
}
- if(!$opt{P}) {
+ my $maxval = 0;
+ if($opt{H} || !$opt{P}) {
for(keys %counts) {
my $l = length(render($_));
$longest = $l if $longest < $l;
+ $maxval = $counts{$_} if $counts{$_} > $maxval;
}
}
-# done reading & counting all input, show the results.
- for(sort { eval $sortcode } keys %counts) {
+ my $histwidth; # TODO: parameterize
+ if($opt{H}) {
+ chomp($histwidth = int(($ENV{COLUMNS} || `tput cols 2>/dev/null` || 80) / 2));
+ }
+
+ # do this instead of using sort { eval $sortcode } in the loop,
+ # since eval is slow.
+ my $sortsub = eval "sub " . $sortcode;
+
+ for(sort { $sortsub } keys %counts) {
+ (print_histogram($counts{$_}, $maxval, $histwidth), print $opt{o}) if $opt{H};
print (my $printable = render($_));
print " " x ($longest - length($printable)) unless $opt{P};
print $opt{o} . $counts{$_} unless $opt{p};