From 5af624fefabe2e2c1344d18c2b76127db42c736d Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Thu, 8 Oct 2015 17:02:13 -0400 Subject: bkt: lots of new features --- bkt | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 244 insertions(+), 20 deletions(-) diff --git a/bkt b/bkt index 42ea6ea..b6668f2 100755 --- a/bkt +++ b/bkt @@ -1,10 +1,12 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl +use warnings; use Getopt::Std; +use open ":locale"; $Getopt::Std::STANDARD_HELP_VERSION++; ($SELF = $0) =~ s,.*/,,; -$VERSION="0.0.0"; +$VERSION="0.0.1"; sub HELP_MESSAGE { print < ... -Options are: +General options: --help -h display this help message + +Output options: -c show counts only (suppress percentages) -p show percentages only (suppress counts) + -t show total count + -x print output in hexadecimal + -s opts output sort options. opts may include: + r - reverse sort + f - fold case + c - sort by count rather than input string + -T thresh filter out results below threshold (which may be a + count or a percentage, e.g. 5%). + -P don't pad output with spaces to length of longest element + -o option enables this as well. + +Input options: + -B binary mode (default: input is characters in current locale) + -o string use string as output delimiter (default: \\t). implies -P. + -b range consider only a range of chars/bytes in each line (e.g. 1-3) + -d delim split on delimiter (default: /\\s+/ aka whitespace) + -f field consider only this (delimiter-separated) field + -i case insensitive (actually, lowercases all input) -w remove leading and trailing whitespace from input lines -W remove ALL whitespace from input lines - -t show total count -e code execute perl code for each line of input (should modify \$_, make sure you quote the argument as needed by your shell) Input will be read from filenames given on the command line, or from -standard input if none given. The input need not be sorted. +standard input if none given. The input need not be sorted. The output +will always be sorted. + +Each line of input is chomped (has trailing \\n removed) before any +further processing. + +-b is like the -b or -c option to cut(1) (depending on whether -B is +set). It supports the same type of range as cut(1): + +N N'th byte/character, counted from 1 +N- from N'th byte/character to end of line +N-M from N'th to M'th (included) byte/character +-M from first to M'th (included) byte/character + +...except that cut allows many ranges separated by commas, while $SELF +-b only allows a single range. + +-d is like the the -d option to cut(1), except that the delimiter can be +multiple characters. Also, the delimiter is treated as a regular expression +if it's at least 4 characters long *and* enclosed in //. + +-f like cut's -f, except that it only allows a single field number (not +a list), which is indexed starting from 1 (same as cut)... or a negative +number, meaning the Nth field from the right (-1 = rightmost). Also +unlike cut, -f and -b may be combined (-b is applied first). + +The -b -f -i -w -W -e options will be processed in the order +listed here, regardless of the order they're given on the command +line. In particular, this means the code for -e will see \$_ *after* +it's been modified by any of the other options. + +There can only be one -e option, but it may be multiple lines of code +separated with semicolons (like perl's own -e option). When the -e code +runs, \$_ contains the input (possibly tranformed by other options), +and can be modified arbitratily. The -e code can filter out unwanted +lines by executing "next", which will cause them to be skipped entirely. EOF } @@ -46,31 +102,172 @@ sub VERSION_MESSAGE { print "$SELF $VERSION\n"; } -getopts('hcpwWte:', \%opt); +sub hexify { + my @out = (); + push @out, sprintf("%x", ord($_)) for split "", $_[0]; + return join(" ", @out); +} + +# main() +getopts('hcpiwWte:d:f:b:xo:Bs:T:P', \%opt); HELP_MESSAGE() if $opt{h}; -die "$SELF: can't give -c and -p options together\n" +die "$SELF: options -c and -p are mutually exclusive\n" if $opt{c} && $opt{p}; +if(defined $opt{o}) { + $opt{P} = 1; +} else { + $opt{o} = "\t"; +} + +if(defined $opt{d}) { + if($opt{d} =~ m|^/(.{2,})/$|) { + $opt{d} = qr/$1/; + } else { + $opt{d} = quotemeta($opt{d}); + $opt{d} = qr/$opt{d}/; + } +} else { + $opt{d} = qr/\s+/; +} + +if(defined $opt{b}) { + for($opt{b}) { + /^(\d+)$/ && do { $substrarg = "$1 - 1, 1" }; + /^(\d+)-$/ && do { $substrarg = "$1 - 1" }; + /^-(\d+)$/ && do { $substrarg = "0, $1" }; + /^(\d+)-(\d+)$/ && do { $substrarg = "$1 - 1, " . ($2 - $1 + 1) }; + } + die "$SELF: invalid -b argument\n" unless $substrarg; +} + +# -f index starts at 1, perl arrays are indexed from 0, fix (but +# don't break using -1 for rightmost field) +$opt{f}-- if defined $opt{f} && $opt{f} > 0; + +if($opt{s}) { + for(split "", $opt{s}) { + /r/ && do { $revsort++; }; + /f/ && do { $foldsort++; }; + /c/ && do { $countsort++; }; + /([^rfc])/ && do { + warn "$SELF: ignoring unknown sort option '$1'\n"; }; + } +} + +if($countsort && $foldsort) { + die "$SELF: sort options 'c' and 'f' are mutually exclusive\n"; +} + +if($countsort) { + $sortcode = "{ " . + '$counts{' . ($revsort ? '$b' : '$a') . '}' . + " <=> " . + '$counts{' . ($revsort ? '$a' : '$b') . '}' . + "}"; +} else { + $sortcode = "{ " . + ($foldsort ? 'lc ' : "" ) . + ($revsort ? '$b' : '$a') . + " cmp " . + ($foldsort ? 'lc ' : "" ) . + ($revsort ? '$a' : '$b') . + "}"; +} + +# finally done with option processing, let the main event commence. + %counts = (); $total = 0; +$longest = 0; +$readfiles = 0; +$badfiles = 0; + +# Sadly, we can't use the magical while(<>) here to automatically iterate +# and open all the files in @ARGV, because of the locale stuff. We need to +# call binmode() on each filehandle after it's opened, but before anything +# gets read from it. 'use open ":bytes"' would set the default binmode, +# but I couldn't get it to work conditionally (not even with eval). + +$ARGV[0] = '-' unless @ARGV; +for(@ARGV) { + my $fh; + + if($_ eq '-') { + $fh = \*STDIN; + } else { + open $fh, '<', $_ or do { + warn "$SELF: $_: $!\n"; + $badfiles++; + next; + }; + } + + binmode $fh, ":bytes" if $opt{B}; + $readfiles++; + + while(<$fh>) { + chomp; + + # behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist, + # or -b10 but only 9 characters exist. + + if($substrarg) { # set via $opt{b} + no warnings qw/substr/; + eval "\$_ = substr(\$_, $substrarg)"; + $_ = "" unless defined $_; + } + + if($opt{f}) { + $_ = (split($opt{d}))[$opt{f}]; + $_ = "" unless defined $_; + } + + $_ = lc if $opt{i}; + s/^\s+|\s+$//g if $opt{w}; + s/\s//g if $opt{W}; + + if($opt{e}) { + no warnings qw/exiting/; # so -e code can "next" to skip a line + eval $opt{e}; + die "$SELF: $@" if $@; + } -while(<>) { - chomp; - if($opt{e}) { - eval $opt{e}; - die "$SELF: $@" if $@; + $counts{$_}++; + $total++; } - s/^\s+|\s+$//g if $opt{w}; - s/\s//g if $opt{W}; - $counts{$_}++; - $total++; } -for(sort keys %counts) { - print $_ . "\t"; - print $counts{$_} . "\t" unless $opt{p}; +die "$SELF: couldn't read any input files\n" unless $readfiles; + +if($opt{T}) { + (my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/); + if($thresh) { + for(keys %counts) { + delete $counts{$_} if + ($pct && (($counts{$_} * 100 / $total) < $thresh)) || + (!$pct && ($counts{$_} < $thresh)); + } + } else { + warn "$SELF: invalid argument for -T\n"; + } +} + +if(!$opt{P}) { + for(keys %counts) { + my $l = length($opt{x} ? hexify($_) : $_); + $longest = $l if $longest < $l; + } +} + +# done reading & counting all input, show the results. +for(sort { eval $sortcode } keys %counts) { + print ($opt{x} ? hexify($_) : $_); + print " " x ($longest - length) unless $opt{P}; + print $opt{o}; + print $counts{$_} . $opt{o} unless $opt{p}; printf "%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c}; print "\n"; } @@ -79,4 +276,31 @@ if($opt{t}) { print "\n-- Total count: $total\n"; } -exit(0); +# be like cat, exit with error status if any input file couldn't be +# read (even if we did successfully read others) +exit($badfiles != 0); + +__END__ + +Examples: + +# show the percentage of binaries that start with each letter/number/etc, +# 4 different ways +cd /usr/bin +ls | bkt -b1 +ls | cut -b1 | bkt +ls | bkt -e '$_=substr($_,0,1)' +ls | bkt -e 's,^(.).*,$1,' + +# show percentages of stuff said by each user in an irssi IRC log. relies +# on the log format having a timestamp, space, for normal lines. +# misses /me actions entirely though. +# add -src to show the most talkative first. +bkt -f2 -e 'next unless /^\