aboutsummaryrefslogtreecommitdiff
path: root/bkt
diff options
context:
space:
mode:
authorB. Watson <yalhcru@gmail.com>2015-10-13 03:50:54 -0400
committerB. Watson <yalhcru@gmail.com>2015-10-13 03:50:54 -0400
commit172d0c5fbecd8978fe74adb5942250e40d59ee98 (patch)
treee22d65ff1e8b37ae020f10b53cea35e77ac85803 /bkt
parentc76bc179d5cdecc6a73a88f8ee47ff474e098e79 (diff)
downloadmisc-scripts-172d0c5fbecd8978fe74adb5942250e40d59ee98.tar.gz
use POD for --help, swap order of -b/-f options
Diffstat (limited to 'bkt')
-rwxr-xr-xbkt446
1 files changed, 309 insertions, 137 deletions
diff --git a/bkt b/bkt
index 8eb1f86..0fee2f4 100755
--- a/bkt
+++ b/bkt
@@ -1,138 +1,338 @@
#!/usr/bin/perl
-# by popular demand:
-use warnings;
-use strict;
+=pod
-# I wish there were a way to do this conditionally.
-# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std');
-use open ":locale", ":std";
+=head1 NAME
-use Getopt::Std;
-# this makes getopts exit after --help:
-$Getopt::Std::STANDARD_HELP_VERSION++;
+B<bkt> - count repeats in input
-(our $SELF = $0) =~ s,.*/,,;
-our $VERSION="0.0.1";
+=head1 SYNOPSIS
-sub HELP_MESSAGE {
- print <<EOF;
-$SELF - count repeats in input
+B<bkt> -h | --help
+
+B<bkt> -[cpiwWtxaBPnkFL] [...] [-e code] [-d delim ] [-f field] [-b list]
+[-o delim] [-s sortopts] [-T thresh[%]] [-/ separator] [-r recordsize]
+<file> <file ...>
-Usage: $SELF <options> <file> ...
+=head1 DESCRIPTION
+
+B<bkt> reads input from files or standard input, optionally transforms
+it according to various options, and counts like inputs. After all input
+is read, a count is given for the occurrence of each input.
Given the following input:
-foo
-foo
-bar
-bar
-baz
-
-$SELF will output:
-
-bar 2 40.0%
-baz 1 20.0%
-foo 2 40.0%
-
-The name 'bkt' comes from the concept of collecting like items in
-buckets. The original plan was to name this script 'bucketize', but
-who wants to type all that? Also, purely to support lazy typists, $SELF
-implements subsets of the functionality of cut(1) and sort(1).
-
-General options:
- --help
- -h display this help message
- --version display '$SELF $VERSION'
- -- end of options; everything after this is treated as a filename
-
-Output options:
- -c show counts only (suppress percentages)
- -p show percentages only (suppress counts)
- -c and -p may be combined, if you can find a use for it
- -t show total count
- -x print output in hexadecimal
- -a ASCII output: render non-ASCII characters as hex escapes
- -s opts output sort options. opts may include:
- r - reverse sort (default is ascending)
- a - sort alphabetically (default is by count)
- f - sort alphabetically, folding case
- -T thresh filter out results below threshold (which may be a
- count or a percentage, e.g. 5%).
- -o string use string as output delimiter (default: \\t). implies -P.
- -P don't pad output with spaces to length of longest element
- -o option enables this as well.
-
-Input options:
- -B binary mode (default: input is characters in current locale)
- -r int read input as fixed-size records (can't combine with -/)
- -/ sep set value of \$/, perl's input record separator. default is \\n.
- one of -w -W -n is highly recommended with this option.
- -b range consider only a range of chars/bytes in each record (e.g. 1-3)
- -d delim delimiter for -f (default: /\\s+/ aka whitespace)
- -f field consider only this (delimiter-separated) field
- -i case insensitive (actually, lowercases all input)
- -w remove leading and trailing whitespace from input records
- -W remove ALL whitespace from input records
- -n remove all non-word (\\W) characters from input records
- -e code execute perl code for each input record (should modify \$_,
- make sure you quote the argument as needed by your shell)
- -k skip blank records
- -F word frequency count. alias for -ink/' '
- -L letter frequency count. alias for -inkr1
-
-Options that don't take arguments may be bundled: -BipW is the same as
--B -i -p -W.
-
-Input will be read from filenames given on the command line, or from
-standard input if none given, or if the filename - (hyphen) is given (use
-./- to read file a real file named -). The input need not be sorted. The
-output will always be sorted.
+ foo
+ foo
+ bar
+ bar
+ baz
+
+B<bkt> will output:
+
+ bar 2 40.0%
+ baz 1 20.0%
+ foo 2 40.0%
+
+The name 'B<bkt>' comes from the concept of collecting like items in
+buckets. The original plan was to name this script 'bucketize', but who
+wants to type all that? Also, purely to support lazy typists, B<bkt>
+implements subsets of the functionality of B<cut>(1) and B<sort>(1).
+
+The utility of B<bkt> will be obvious, if you've written lots of
+variants of:
+
+ <shell commands> | perl -lne 's/\s.*//; $a{$_}++; END { print "$_ $a{$_}" for sort keys %a }'
+
+=head1 OPTIONS
+
+=head2 General options
+
+=over
+
+=item B<--help>, B<-h>
+
+Display this help message
+
+=item B<--version>
+
+Display version of bkt
+
+=item B<-->
+
+End of options. Everything after this is treated as a filename.
+
+=back
+
+=head2 Output options
+
+=over
+
+=item B<-c>
+
+Show counts only (suppress percentages).
+
+=item B<-p>
+
+Show percentages only (suppress counts).
+-c and -p may be combined, if you can find a use for it.
+
+=item B<-t>
+
+Show total count after all item counts.
+
+=item B<-x>
+
+Print output in hexadecimal.
+
+=item B<-a>
+
+ASCII output: render non-ASCII characters as hex escapes.
+
+=item B<-s> I<opts>
+
+Output sort options. Options may include:
+
+ r - reverse sort (default is ascending)
+ a - sort alphabetically (default is by count, then alpha)
+ f - fold case
+
+=item B<-T> I<thresh[%]>
+
+Filter out results below threshold, which may be a count or a percentage, e.g. 5%.
+
+=item B<-o> I<string>
+
+Use string as output delimiter (default: \t). Implies -P.
+
+=item B<-P>
+
+Don't pad output with spaces to length of longest element. The -o option enables this as well.
+
+=back
+
+=head2 Input options
+
+=over
+
+=item B<-B>
+
+Byte mode. By default, input is treated as characters in the encoding
+specified by the current locale. B<-B> treats the input as a stream of
+8-bit bytes (octets, if you like).
+
+=item B<-r> I<recordsize>
+
+Read input as fixed-size records. This can't be combined with B<-/>.
+
+=item B<-/> I<sep>
+
+Set value of B<$/>, perl's input record separator. Default is I<\n>.
+One of I<-w> I<-W> I<-n> is highly recommended with this option. This
+can't be combined with B<-r>.
+
+=back
+
+=head2 Transform options
+
+=over
+
+=item B<-d> I<delim>
+
+Delimiter for B<-f>. Default: /\s+/ aka whitespace. This can be a fixed
+string or a regular expression (if enclosed in //, with optional /i
+modifier). This option does nothing without B<-f>.
+
+=item B<-f> I<field>
+
+Consider only this (B<-d> delimiter separated) field.
+
+=item B<-b> I<range>
+
+Consider only a range of characters (or bytes, if B<-B>) in each record.
+Example: I<1-3> for the first 3 bytes/chars of each input record.
+
+=item B<-i>
+
+Case insensitive mode. Actually, lowercases all input. Use B<-sf>
+instead to sort output case-insensitively.
+
+=item B<-w>
+
+Remove leading and trailing whitespace from input records.
+
+=item B<-W>
+
+Remove ALL whitespace from input records.
+
+=item B<-n>
+
+Remove all non-word (I<\W>) characters from input records.
+
+=item B<-e> I<code>
+
+Execute perl code for each input record. The code should modify B<$_>.
+Make sure you quote the argument as needed by your shell.
+
+=item B<-k>
+
+Skip blank records. Basically the same as B<-e 'next if $_ eq ""'>.
+
+=item B<-F>
+
+Word frequency count. Alias for B<-ink/' '>.
+
+=item B<-L>
+
+Letter frequency count. Alias for B<-inkr1>.
+
+=back
+
+Options that don't take arguments may be bundled: B<-BipW> is the same as
+B<-B> B<-i> B<-p> B<-W>.
+
+Input will be read from filenames given on the command line, or
+from standard input if none given, or if the filename B<-> (hyphen) is
+given. Use B<./-> to read a real file named B<->. The input need
+not be sorted. The output will always be sorted.
Each input record is chomped before any further processing.
--b is like the -b or -c option to cut(1) (depending on whether -B is
-set). It supports the same type of range as cut(1):
+B<-b> is like the B<-b> or B<-c> option to cut(1) (depending on whether B<-B> is
+set). It supports the same types of range as cut(1):
-N N'th byte/character, counted from 1
-N- from N'th byte/character to end of record
-N-M from N'th to M'th (included) byte/character
--M from first to M'th (included) byte/character
+ N N'th byte/character, counted from 1
+ N- from N'th byte/character to end of record
+ N-M from N'th to M'th (included) byte/character
+ -M from first to M'th (included) byte/character
...plus 2 extra types:
--M- from Mth-to-last byte/character to end of record (-1 = last)
--M-N from Mth-to-last byte/characters to Nth-to-last
+ -M- from Mth-to-last byte/character to end of record (-1 = last)
+ -M-N from Mth-to-last byte/character to Nth-to-last
-...except that cut allows many ranges separated by commas, while $SELF
--b only allows a single range.
+...except that cut allows many ranges separated by commas, while B<bkt>
+B<-b> only allows a single range.
--d is like the the -d option to cut(1), except that the delimiter can
+B<-d> is like the -d option to cut(1), except that the delimiter can
be multiple characters. Also, the delimiter is treated as a regular
-expression if it's at least 3 characters long *and* enclosed in //. The
+expression if it's at least 3 characters long *and* enclosed in I<//>. The
/i modifier is supported, but none of the other /x regex modifiers are.
--f like cut's -f, except that it only allows a single field number (not
+B<-f> is like cut's B<-f>, except that it only allows a single field number (not
a list), which is indexed starting from 1 (same as cut)... or a negative
number, meaning the Nth field from the right (-1 = rightmost). Also
-unlike cut, -f and -b may be combined (-b is applied first).
+unlike cut, B<-f> and B<-b> may be combined (B<-f> is applied first).
-The -b -f -i -w -W -n -e<code> -k options will be processed in the
-order listed here, regardless of the order they're given on the command
-line. In particular, this means the code for -e will see \$_ *after*
-it's been modified by any of the other options (except -k).
+The B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-e>I<code> B<-k> options will
+be processed in the order listed here, regardless of the order they're
+given on the command line. In particular, this means the code for B<-e>
+will see B<$_> *after* it's been modified by any of the other options
+(except B<-k>).
-The code for -e will run with strict disabled and warnings enabled. To
+The code for B<-e> will run with strict disabled and warnings enabled. To
disable warnings, prefix the code with 'no warnings;'. There can only
-be one -e option, but it may be multiple lines of code separated with
-semicolons (like perl's own -e option). When the -e code runs, \$_
+be one B<-e> option, but it may be multiple lines of code separated with
+semicolons (like perl's own B<-e> option). When the B<-e> code runs, B<$_>
contains the input (possibly transformed by other options), and can
-be modified arbitratily. The -e code can filter out unwanted records by
+be modified arbitrarily. The B<-e> code can filter out unwanted records by
executing "next", which will cause them to be skipped entirely. Also,
-if the -k option is used, the code can 'undef \\$_' or assign \\$_=""
+if the B<-k> option is used, the code can B<undef $_> or assign B<$_="">
to skip the current record.
-EOF
+The astute reader will have noticed that all the other transform options
+could be written as code for B<-e>. This is correct: the other options
+exist to support lazy typists such as the author.
+
+=head1 EXAMPLES
+
+Show the percentage of binaries that start with each letter/number/etc,
+4 different ways.
+
+ cd /usr/bin
+ ls | bkt -b1
+ ls | cut -b1 | bkt
+ ls | bkt -e '$_=substr($_,0,1)'
+ ls | bkt -e 's,^(.).*,$1,'
+
+Show percentages of lines said by each user in an irssi IRC log. Relies
+on the log format having a timestamp, space, <nick> for normal lines.
+Misses /me actions entirely though. Add -sr to show the most talkative
+first.
+
+ bkt -f2 -e 'next unless /^\</' channelname.log
+
+Show us how many users use each shell (including stuff like /bin/false).
+
+ bkt -d: -f-1 /etc/passwd
+
+How many images of each type have we got? Ignore case, so JPG and jpg
+are counted together.
+
+ ls ~/images/*.* | bkt -i -d. -f-1
+
+What percentage of words in a text file are capitalized?
+
+ bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt
+
+Given a CSV file with fields lastname, firstname, phonenumber:
+
+ Blow,Joe,444-555-0123
+ Showers,April,876-333-9874
+ ...etc...
+
+...to get a breakdown by area code:
+
+ bkt -d, -f3 -b1-3
+
+Suppose you have a team of people working on a large C++ or Java codebase.
+By convention, TODO comments are written as:
+
+ // TODO bob: Support non-Unicode locales
+
+...where "bob" is the coder assigned to that TODO item. You can get
+a summary of these with:
+
+ find . -name '*.c++' | xargs bkt -e 'm,//\s+TODO\s+(\w+),||next; $_=$1;'
+
+...which might show something like:
+
+ john 3 3.7%
+ bill 18 22.2%
+ jane 23 28.4%
+ bob 37 45.7%
+
+=head1 AUTHOR
+
+B. Watson <yalhcru@gmail.com>
+
+=head1 LICENSE
+
+WTFPL. See http://www.wtfpl.net/txt/copying/ for full text of license.
+
+=head1 SEE ALSO
+
+B<cut>(1), B<sort>(1), B<perl>(1)
+
+=cut
+
+# by popular demand:
+use warnings;
+use strict;
+
+# I wish there were a way to do this conditionally.
+# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std');
+use open ":locale", ":std";
+
+use Getopt::Std;
+# this makes getopts exit after --help:
+$Getopt::Std::STANDARD_HELP_VERSION++;
+
+(our $SELF = $0) =~ s,.*/,,;
+our $VERSION="0.0.1";
+
+sub HELP_MESSAGE { # Getopt::Std hook for --help: show this script's own POD rather than a hand-written usage text
+ exec "perldoc $0"; # replaces this process with perldoc; NOTE(review): single-string exec may go through the shell if $0 contains metacharacters -- list form exec('perldoc', $0) would avoid that
+}
sub VERSION_MESSAGE {
@@ -327,14 +527,14 @@ for(@ARGV) {
# behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist,
# or -b10 but only 9 characters exist.
- if($substrarg) { # set via $opt{b}
- no warnings qw/substr/;
- eval "\$_ = substr(\$_, $substrarg)";
+ if(defined $opt{f}) {
+ $_ = (split(/$opt{d}/))[$opt{f}];
$_ = "" unless defined $_;
}
- if(defined $opt{f}) {
- $_ = (split(/$opt{d}/))[$opt{f}];
+ if($substrarg) { # set via $opt{b}
+ no warnings qw/substr/;
+ eval "\$_ = substr(\$_, $substrarg)";
$_ = "" unless defined $_;
}
@@ -395,31 +595,3 @@ if($opt{t}) {
# be like cat, exit with error status if any input file couldn't be
# read (even if we did successfully read others)
exit($badfiles != 0);
-
-__END__
-
-Examples:
-
-# show the percentage of binaries that start with each letter/number/etc,
-# 4 different ways
-cd /usr/bin
-ls | bkt -b1
-ls | cut -b1 | bkt
-ls | bkt -e '$_=substr($_,0,1)'
-ls | bkt -e 's,^(.).*,$1,'
-
-# show percentages of stuff said by each user in an irssi IRC log. relies
-# on the log format having a timestamp, space, <nick> for normal lines.
-# misses /me actions entirely though.
-# add -sr to show the most talkative first.
-bkt -f2 -e 'next unless /^\</' channelname.log
-
-# show us how many users use each shell (including stuff like /bin/false).
-bkt -d: -f-1 /etc/passwd
-
-# how many images of each type have we got? ignore case, so JPG and jpg
-# are counted together.
-ls ~/images/*.* | bkt -d. -f-1
-
-# What percentage of words in a text file are capitalized?
-bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt