diff options
Diffstat (limited to 'bkt')
-rwxr-xr-x | bkt | 446 |
1 files changed, 309 insertions, 137 deletions
@@ -1,138 +1,338 @@ #!/usr/bin/perl -# by popular demand: -use warnings; -use strict; +=pod -# I wish there were a way to do this conditionally. -# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std'); -use open ":locale", ":std"; +=head1 NAME -use Getopt::Std; -# this makes getopts exit after --help: -$Getopt::Std::STANDARD_HELP_VERSION++; +B<bkt> - count repeats in input -(our $SELF = $0) =~ s,.*/,,; -our $VERSION="0.0.1"; +=head1 SYNOPSIS -sub HELP_MESSAGE { - print <<EOF; -$SELF - count repeats in input +B<bkt> -h | --help + +B<bkt> -[cpiwWtxaBPnkFL] [...] [-e code] [-d delim ] [-f field] [-b list] +[-o delim] [-s sortopts] [-T thresh[%]] [-/ separator] [-r recordsize] +<file> <file ...> -Usage: $SELF <options> <file> ... +=head1 DESCRIPTION + +B<bkt> reads input from files or standard input, optionally transforms +it according to various options, and counts like inputs. After all input +is read, a count is given for the occurrence of each input. Given the following input: -foo -foo -bar -bar -baz - -$SELF will output: - -bar 2 40.0% -baz 1 20.0% -foo 2 40.0% - -The name 'bkt' comes from the concept of collecting like items in -buckets. The original plan was to name this script 'bucketize', but -who wants to type all that? Also, purely to support lazy typists, $SELF -implements subsets of the functionality of cut(1) and sort(1). - -General options: - --help - -h display this help message - --version display '$SELF $VERSION' - -- end of options; everything after this is treated as a filename - -Output options: - -c show counts only (suppress percentages) - -p show percentages only (suppress counts) - -c and -p may be combined, if you can find a use for it - -t show total count - -x print output in hexadecimal - -a ASCII output: render non-ASCII characters as hex escapes - -s opts output sort options. 
opts may include: - r - reverse sort (default is ascending) - a - sort alphabetically (default is by count) - f - sort alphabetically, folding case - -T thresh filter out results below threshold (which may be a - count or a percentage, e.g. 5%). - -o string use string as output delimiter (default: \\t). implies -P. - -P don't pad output with spaces to length of longest element - -o option enables this as well. - -Input options: - -B binary mode (default: input is characters in current locale) - -r int read input as fixed-size records (can't combine with -/) - -/ sep set value of \$/, perl's input record separator. default is \\n. - one of -w -W -n is highly recommended with this option. - -b range consider only a range of chars/bytes in each record (e.g. 1-3) - -d delim delimiter for -f (default: /\\s+/ aka whitespace) - -f field consider only this (delimiter-separated) field - -i case insensitive (actually, lowercases all input) - -w remove leading and trailing whitespace from input records - -W remove ALL whitespace from input records - -n remove all non-word (\\W) characters from input records - -e code execute perl code for each input record (should modify \$_, - make sure you quote the argument as needed by your shell) - -k skip blank records - -F word frequency count. alias for -ink/' ' - -L letter frequency count. alias for -inkr1 - -Options that don't take arguments may be bundled: -BipW is the same as --B -i -p -W. - -Input will be read from filenames given on the command line, or from -standard input if none given, or if the filename - (hyphen) is given (use -./- to read file a real file named -). The input need not be sorted. The -output will always be sorted. + foo + foo + bar + bar + baz + +B<bkt> will output: + + bar 2 40.0% + baz 1 20.0% + foo 2 40.0% + +The name 'B<bkt>' comes from the concept of collecting like items in +buckets. The original plan was to name this script 'bucketize', but who +wants to type all that? 
Also, purely to support lazy typists, B<bkt> +implements subsets of the functionality of B<cut>(1) and B<sort>(1). + +The utility of B<bkt> will be obvious, if you've written lots of +variants of: + + <shell commands> | perl -lne 's/\s.*//; $a{$_}++; END { print "$_ $a{$_}" for sort keys %a }' + +=head1 OPTIONS + +=head2 General options + +=over + +=item B<--help>, B<-h> + +Display this help message + +=item B<--version> + +Display version of bkt + +=item B<--> + +End of options. Everything after this is treated as a filename. + +=back + +=head2 Output options + +=over + +=item B<-c> + +Show counts only (suppress percentages). + +=item B<-p> + +Show percentages only (suppress counts). +-c and -p may be combined, if you can find a use for it. + +=item B<-t> + +Show total count after all item counts. + +=item B<-x> + +Print output in hexadecimal. + +=item B<-a> + +ASCII output: render non-ASCII characters as hex escapes. + +=item B<-s> I<opts> + +Output sort options. Options may include: + + r - reverse sort (default is ascending) + a - sort alphabetically (default is by count, then alpha) + f - fold case + +=item B<-T> I<thresh[%]> + +Filter out results below threshold, which may be a count or a percentage, e.g. 5%. + +=item B<-o> I<string> + +Use string as output delimiter (default: \t). Implies -P. + +=item B<-P> + +Don't pad output with spaces to length of longest element. The -o option enables this as well. + +=back + +=head2 Input options + +=over + +=item B<-B> + +Byte mode. By default, input is treated as characters in the encoding +specified by the current locale. B<-B> treats the input as a stream of +8-bit bytes (octets, if you like). + +=item B<-r> I<recordsize> + +Read input as fixed-size records. This can't be combined with B<-/>. + +=item B<-/> I<sep> + +Set value of B<$/>, perl's input record separator. Default is I<\n>. +One of I<-w> I<-W> I<-n> is highly recommended with this option. This +can't be combined with B<-r>. 
+ +=back + +=head2 Transform options + +=over + +=item B<-d> I<delim> + +Delimiter for B<-f>. Default: /\s+/ aka whitespace. This can be a fixed +string or a regular expression (if enclosed in //, with optional /i +modifier). This option does nothing without B<-f>. + +=item B<-f> I<field> + +Consider only this (B<-d> delimiter separated) field. + +=item B<-b> I<range> + +Consider only a range of characters (or bytes, if B<-B>) in each record. +Example: I<1-3> for the first 3 bytes/chars of each input record. + +=item B<-i> + +Case insensitive mode. Actually, lowercases all input. Use B<-sf> +instead to sort output case-insensitively. + +=item B<-w> + +Remove leading and trailing whitespace from input records. + +=item B<-W> + +Remove ALL whitespace from input records. + +=item B<-n> + +Remove all non-word (I<\W>) characters from input records. + +=item B<-e> I<code> + +Execute perl code for each input record. The code should modify B<$_>. +Make sure you quote the argument as needed by your shell. + +=item B<-k> + +Skip blank records. Basically the same as B<-e 'next if $_ eq ""'>. + +=item B<-F> + +Word frequency count. Alias for B<-ink/' '>. + +=item B<-L> + +Letter frequency count. Alias for B<-inkr1>. + +=back + +Options that don't take arguments may be bundled: B<-BipW> is the same as +B<-B> B<-i> B<-p> B<-W>. + +Input will be read from filenames given on the command line, or +from standard input if none given, or if the filename B<-> (hyphen) is +given. Use B<./-> to read a real file named B<->. The input need +not be sorted. The output will always be sorted. Each input record is chomped before any further processing. --b is like the -b or -c option to cut(1) (depending on whether -B is -set). It supports the same type of range as cut(1): +B<-b> is like the B<-b> or B<-c> option to cut(1) (depending on whether B<-B> is +set). 
It supports the same types of range as cut(1): -N N'th byte/character, counted from 1 -N- from N'th byte/character to end of record -N-M from N'th to M'th (included) byte/character --M from first to M'th (included) byte/character + N N'th byte/character, counted from 1 + N- from N'th byte/character to end of record + N-M from N'th to M'th (included) byte/character + -M from first to M'th (included) byte/character ...plus 2 extra types: --M- from Mth-to-last byte/character to end of record (-1 = last) --M-N from Mth-to-last byte/characters to Nth-to-last + -M- from Mth-to-last byte/character to end of record (-1 = last) + -M-N from Mth-to-last byte/characters to Nth-to-last -...except that cut allows many ranges separated by commas, while $SELF --b only allows a single range. +...except that cut allows many ranges separated by commas, while B<bkt> +B<-b> only allows a single range. --d is like the the -d option to cut(1), except that the delimiter can +B<-d> is like the -d option to cut(1), except that the delimiter can be multiple characters. Also, the delimiter is treated as a regular -expression if it's at least 3 characters long *and* enclosed in //. The +expression if it's at least 3 characters long *and* enclosed in I<//>. The /i modifier is supported, but none of the other /x regex modifiers are. --f like cut's -f, except that it only allows a single field number (not +B<-f> is like cut's B<-f>, except that it only allows a single field number (not a list), which is indexed starting from 1 (same as cut)... or a negative number, meaning the Nth field from the right (-1 = rightmost). Also -unlike cut, -f and -b may be combined (-b is applied first). +unlike cut, B<-f> and B<-b> may be combined (B<-f> is applied first). -The -b -f -i -w -W -n -e<code> -k options will be processed in the -order listed here, regardless of the order they're given on the command -line. 
In particular, this means the code for -e will see \$_ *after* -it's been modified by any of the other options (except -k). +The B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-e>I<code> B<-k> options will +be processed in the order listed here, regardless of the order they're +given on the command line. In particular, this means the code for B<-e> +will see B<$_> *after* it's been modified by any of the other options +(except B<-k>). -The code for -e will run with strict disabled and warnings enabled. To +The code for B<-e> will run with strict disabled and warnings enabled. To disable warnings, prefix the code with 'no warnings;'. There can only -be one -e option, but it may be multiple lines of code separated with -semicolons (like perl's own -e option). When the -e code runs, \$_ +be one B<-e> option, but it may be multiple lines of code separated with +semicolons (like perl's own B<-e> option). When the B<-e> code runs, B<$_> contains the input (possibly tranformed by other options), and can -be modified arbitratily. The -e code can filter out unwanted records by +be modified arbitrarily. The B<-e> code can filter out unwanted records by executing "next", which will cause them to be skipped entirely. Also, -if the -k option is used, the code can 'undef \\$_' or assign \\$_="" +if the B<-k> option is used, the code can B<undef $_> or assign B<$_=""> to skip the current record. -EOF +The astute reader will have noticed that all the other transform options +could be written as code for B<-e>. This is correct: the other options +exist to support lazy typists such as the author. + +=head1 EXAMPLES + +Show the percentage of binaries that start with each letter/number/etc, +4 different ways. + + cd /usr/bin + ls | bkt -b1 + ls | cut -b1 | bkt + ls | bkt -e '$_=substr($_,0,1)' + ls | bkt -e 's,^(.).*,$1,' + +Show percentages of lines said by each user in an irssi IRC log. Relies +on the log format having a timestamp, space, <nick> for normal lines. 
+Misses /me actions entirely though. Add -sr to show the most talkative +first. + + bkt -f2 -e 'next unless /^\</' channelname.log + +Show us how many users use each shell (including stuff like /bin/false). + + bkt -d: -f-1 /etc/passwd + +How many images of each type have we got? Ignore case, so JPG and jpg +are counted together. + + ls ~/images/*.* | bkt -i -d. -f-1 + +What percentage of words in a text file are capitalized? + + bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt + +Given a CSV file with fields lastname, firstname, phonenumber: + + Blow,Joe,444-555-0123 + Showers,April,876-333-9874 + ...etc... + +...to get a breakdown by area code: + + bkt -d, -f3 -b1-3 + +Suppose you have a team of people working on a large C++ or Java codebase. +By convention, TODO comments are written as: + + // TODO bob: Support non-Unicode locales + +...where "bob" is the coder assigned to that TODO item. You can get a +summary of these with: + + find . -name '*.c++' | xargs bkt -e 'm,//\s+TODO\s+(\w+),||next; $_=$1;' + +...which might show something like: + + john 3 3.7% + bill 18 22.2% + jane 23 28.4% + bob 37 45.7% + +=head1 AUTHOR + +B. Watson <yalhcru@gmail.com> + +=head1 LICENSE + +WTFPL. See http://www.wtfpl.net/txt/copying/ for full text of license. + +=head1 SEE ALSO + +B<cut>(1), B<sort>(1), B<perl>(1) + +=cut + +# by popular demand: +use warnings; +use strict; + +# I wish there were a way to do this conditionally. +# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std'); +use open ":locale", ":std"; + +use Getopt::Std; +# this makes getopts exit after --help: +$Getopt::Std::STANDARD_HELP_VERSION++; + +(our $SELF = $0) =~ s,.*/,,; +our $VERSION="0.0.1"; + +sub HELP_MESSAGE { + exec "perldoc $0"; } sub VERSION_MESSAGE { @@ -327,14 +527,14 @@ for(@ARGV) { # behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist, # or -b10 but only 9 characters exist. 
- if($substrarg) { # set via $opt{b} - no warnings qw/substr/; - eval "\$_ = substr(\$_, $substrarg)"; + if(defined $opt{f}) { + $_ = (split(/$opt{d}/))[$opt{f}]; $_ = "" unless defined $_; } - if(defined $opt{f}) { - $_ = (split(/$opt{d}/))[$opt{f}]; + if($substrarg) { # set via $opt{b} + no warnings qw/substr/; + eval "\$_ = substr(\$_, $substrarg)"; $_ = "" unless defined $_; } @@ -395,31 +595,3 @@ if($opt{t}) { # be like cat, exit with error status if any input file couldn't be # read (even if we did successfully read others) exit($badfiles != 0); - -__END__ - -Examples: - -# show the percentage of binaries that start with each letter/number/etc, -# 4 different ways -cd /usr/bin -ls | bkt -b1 -ls | cut -b1 | bkt -ls | bkt -e '$_=substr($_,0,1)' -ls | bkt -e 's,^(.).*,$1,' - -# show percentages of stuff said by each user in an irssi IRC log. relies -# on the log format having a timestamp, space, <nick> for normal lines. -# misses /me actions entirely though. -# add -sr to show the most talkative first. -bkt -f2 -e 'next unless /^\</' channelname.log - -# show us how many users use each shell (including stuff like /bin/false). -bkt -d: -f-1 /etc/passwd - -# how many images of each type have we got? ignore case, so JPG and jpg -# are counted together. -ls ~/images/*.* | bkt -d. -f-1 - -# What percentage of words in a text file are capitalized? -bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt |