aboutsummaryrefslogtreecommitdiff
path: root/bkt
diff options
context:
space:
mode:
Diffstat (limited to 'bkt')
-rwxr-xr-xbkt379
1 files changed, 252 insertions, 127 deletions
diff --git a/bkt b/bkt
index 0fee2f4..ffa31b6 100755
--- a/bkt
+++ b/bkt
@@ -44,9 +44,16 @@ variants of:
<shell commands> | perl -lne 's/\s.*//; $a{$_}++; END { print "$_ $a{$_}" for sort keys %a }'
+The above could be written as:
+
+ <shell commands> | bkt -f1
+
=head1 OPTIONS
-=head2 General options
+Options that don't take arguments may be bundled: B<-BipW> is the same
+as B<-B> B<-i> B<-p> B<-W>.
+
+=head2 General Options
=over
@@ -64,54 +71,9 @@ End of options. Everything after this is treated as a filename.
=back
-=head2 Output options
-
-=over
-
-=item B<-c>
-
-Show counts only (suppress percentages).
-
-=item B<-p>
-
-Show percentages only (suppress counts).
--c and -p may be combined, if you can find a use for it.
-
-=item B<-t>
-
-Show total count after all item counts.
-
-=item B<-x>
-
-Print output in hexadecimal.
-
-=item B<-a>
-
-ASCII output: render non-ASCII characters as hex escapes.
-
-=item B<-s> I<opts>
-
-Output sort options. Options may include:
-
- r - reverse sort (default is ascending)
- a - sort alphabetically (default is by count, then alpha)
- f - fold case
-
-=item B<-T> I<thresh[%]>
-
-Filter out results below threshold, which may be a count or a percentage, e.g. 5%.
-
-=item B<-o> I<string>
-
-Use string as output delimiter (default: \\t). Implies -P.
+=head2 Input Options
-=item B<-P>
-
-Don't pad output with spaces to length of longest element. The -o option enables this as well.
-
-=back
-
-=head2 Input options
+These options are applied to each input file when it's first opened for reading.
=over
@@ -123,34 +85,70 @@ specified by the current locale. B<-B> treats the input as a stream of
=item B<-r> I<recordsize>
-Read input as fixed-size records. This can't be combined with B<-/>.
+Read input as fixed-size records. This can't be combined with B<-/>,
+as it sets the value of B<$/> to a reference to its argument. See
+I<perldoc -v '$/'>.
-=item B<-/> I<sep>
+=item B<-/> I<separator>
Set value of B<$/>, perl's input record separator. Default is I<\n>.
-One of I<-w> I<-W> I<-n> is highly recommended with this option. This
+One of B<-w> B<-W> B<-n> is highly recommended with this option. This
can't be combined with B<-r>.
=back
-=head2 Transform options
+=head2 Transform Options
+
+These options are applied to each record of input, in the order listed
+here, before it's counted. If you haven't used B<-r> or B<-/>, a record
+is a line.
+
+The B<-f> and B<-b> options can be used together, which is unlike B<cut>(1).
=over
+=item B<-l> I<range>
+
+Operate only on records numbered within the range, e.g. I<1-3> for the
+first 3 records of each file, I<10-> for 10th through the end, I<15>
+for only the 15th record. The starting record is optional and defaults
+to 1, so I<-3> could be used for the first example. If multiple files
+are given, the numbering resets to 1 at the start of each file.
+
=item B<-d> I<delim>
-Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a fixed
+Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a literal
string or a regular expression (if enclosed in //, with optional /i
-modifier). This option does nothing without B<-f>.
+modifier). As a special case, I<//> alone is treated as a literal string.
+This option does nothing without B<-f>. B<-d>'s argument is used with
+perl's B<split>, so you might read I<perldoc -f split> to understand this.
=item B<-f> I<field>
-Consider only this (B<-d> delimiter separated) field.
+Consider only this (B<-d> delimiter separated) field. Unlike B<cut>,
+only one field may be selected (B<cut>'s lists of fields are not
+supported). Also unlike B<cut>, negative field numbers can be used to
+index from the rightmost field (which is numbered B<-1>).
=item B<-b> I<range>
-Consider only a range of characters (or bytes, if B<-B>) in each record.
-Example: I<1-3> for the first 3 bytes/chars of each input record.
+Like B<cut -b>: Consider only a range of characters (or bytes, if B<-B>)
+in each record. Example: I<1-3> for the first 3 bytes/chars of each
+input record.
+
+B<-b> supports the same types of range as cut(1):
+
+ N N'th byte/character, counted from 1
+ N- from N'th byte/character to end of record
+ N-M from N'th to M'th (included) byte/character
+ -M from first to M'th (included) byte/character
+
+...plus 2 extra types:
+
+ -M- from Mth-to-last byte/character (included) to end of record (-1 = last)
+ -M-N from Mth-to-last byte/character to Nth-to-last (included)
+
+...including multiple ranges separated by commas.
=item B<-i>
@@ -169,10 +167,20 @@ Remove ALL whitespace from input records.
Remove all non-word (I<\W>) characters from input records.
+=item B<-g> I<regex>
+
+Grep for regex. Equivalent to: B<-e 'next unless /regex/'>. Remember
+this is a perl-style regex, not a B<grep>(1) one.
+
+=item B<-v> I<regex>
+
+Grep for records not containing regex. Equivalent to: B<-e 'next if /regex/'>.
+
=item B<-e> I<code>
Execute perl code for each input record. The code should modify B<$_>.
-Make sure you quote the argument as needed by your shell.
+Make sure you quote the argument as needed by your shell. See NOTES
+below for more information.
=item B<-k>
@@ -188,47 +196,79 @@ Letter frequency count. Alias for B<-inkr1>.
=back
-Options that don't take arguments may be bundled: B<-BipW> is the same as
-B<-B> B<-i> B<-p> B<-W>.
+=head2 Output Options
-Input will be read from filenames given on the command line, or
-from standard input if none given, or if the filename B<-> (hyphen) is
-given. Use B<./-> to read file a real file named B<->. The input need
-not be sorted. The output will always be sorted.
+These options are applied after all I/O and counting is done, and
+only affect how the output is printed.
-Each input record is chomped before any further processing.
+=over
-B<-b> is like the B<-b> or B<-c> option to cut(1) (depending on whether B<-B> is
-set). It supports the same types of range as cut(1):
+=item B<-c>
- N N'th byte/character, counted from 1
- N- from N'th byte/character to end of record
- N-M from N'th to M'th (included) byte/character
- -M from first to M'th (included) byte/character
+Show counts only (suppress percentages).
-...plus 2 extra types:
+=item B<-p>
- -M- from Mth-to-last byte/character to end of record (-1 = last)
- -M-N from Mth-to-last byte/characters to Nth-to-last
+Show percentages only (suppress counts).
+B<-c> and B<-p> may be combined, if you can find a use for it.
-...except that cut allows many ranges separated by commas, while B<bkt>
-B<-b> only allows a single range.
+=item B<-t>
+
+Show total count after all item counts.
+
+=item B<-C>
+
+Print the records themselves, instead of the counts. Allows B<bkt> to
+be used as a general-purpose text manipulation tool. Mainly this option
+was implemented for debugging purposes, but it might be useful for other
+stuff. When B<-C> is used, B<-a> and B<-x> still work, but none of the
+other output options have any effect.
+
+=item B<-x>
+
+Print output records as hexadecimal.
+
+=item B<-a>
+
+ASCII output: render non-ASCII characters as hex escapes.
+
+=item B<-s> I<sortopts>
+
+Output sort options. Options may include:
+
+ r - reverse sort (default is ascending)
+ a - sort alphabetically (default is by count, then alpha)
+ f - fold case
-B<-d> is like the the -d option to cut(1), except that the delimiter can
-be multiple characters. Also, the delimiter is treated as a regular
-expression if it's at least 3 characters long *and* enclosed in I<//>. The
-/i modifier is supported, but none of the other /x regex modifiers are.
+=item B<-T> I<thresh[%]>
+
+Filter out results below threshold, which may be a count or a percentage, e.g. 5%.
+
+=item B<-o> I<string>
-B<-f> like cut's B<-f>, except that it only allows a single field number (not
-a list), which is indexed starting from 1 (same as cut)... or a negative
-number, meaning the Nth field from the right (-1 = rightmost). Also
-unlike cut, B<-f> and B<-b> may be combined (B<-f> is applied first).
+Use string as output delimiter (default: \\t). Implies B<-P>.
+
+=item B<-P>
+
+Don't pad output with spaces to length of longest element.
+The B<-o> option enables this as well.
+
+=back
-The B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-e>I<code> B<-k> options will
-be processed in the order listed here, regardless of the order they're
-given on the command line. In particular, this means the code for B<-e>
-will see B<$_> *after* it's been modified by any of the other options
-(except B<-k>).
+=head1 NOTES
+
+Input will be read from filenames given on the command line, or
+from standard input if none given, or if the filename B<-> (hyphen) is
+given. Use B<./-> to read a real file named B<->. The input need
+not be sorted. The output will always be sorted.
+
+Each input record is chomped before any further processing.
+
+The B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g> B<-v> B<-e>I<code>
+B<-k> options will be applied to each record in the order listed here,
+regardless of the order they're given on the command line. In particular,
+this means the code for B<-e> will see B<$_> *after* it's been modified
+by any of the other options (except B<-k>).
The code for B<-e> will run with strict disabled and warnings enabled. To
disable warnings, prefix the code with 'no warnings;'. There can only
@@ -244,6 +284,17 @@ The astute reader will have noticed that all the other transform options
could be written as code for B<-e>. This is correct: the other options
exist to support lazy typists such as the author.
+=head1 EXIT STATUS
+
+B<bkt> exits with zero (success) status if all operations were successful,
+otherwise non-zero. Currently, there are no specific non-zero exit codes
+(e.g. different ones for different error types), though this may change
+in the future.
+
+If some files couldn't be read, but at least one could, the file(s)
+that were readable are processed normally and the exit status will be
+non-zero. This mimics the behaviour of GNU cat(1), head(1), tail(1), etc.
+
=head1 EXAMPLES
Show the percentage of binaries that start with each letter/number/etc,
@@ -255,26 +306,65 @@ Show the percentage of binaries that start with each letter/number/etc,
ls | bkt -e '$_=substr($_,0,1)'
ls | bkt -e 's,^(.).*,$1,'
+--
+
Show percentages of lines said by each user in an irssi IRC log. Relies
on the log format having a timestamp, space, <nick> for normal lines.
Misses /me actions entirely though. Add -sr to show the most talkative
first.
- bkt -f2 -e 'next unless /^\</' channelname.log
+ bkt -f2 -g'<' channelname.log
+
+--
+
+We have a directory full of scripts, a mix of perl, python, shell, etc.
+How many scripts are written in each language? Skip any files that
+don't start with a #! (shebang) line.
+
+Using B<bkt> only:
+
+ bkt -l1 -g'#!' scripts/*
+
+Which is equivalent to:
+
+ head -qn1 scripts/* | grep '#!' | bkt
+
+If you wanted to make the same assumption the OS does, that a script
+missing its shebang line is a #!/bin/sh script:
+
+ bkt -l1 -e'$_="#!/bin/sh" unless /#!/' scripts/*
+
+If some of them might be ELF executables, add "-B -v ELF" to the above.
+
+--
Show us how many users use each shell (including stuff like /bin/false).
bkt -d: -f-1 /etc/passwd
+--
+
How many images of each type have we got? Ignore case, so JPG and jpg
are counted together.
ls ~/images/*.* | bkt -i -d. -f-1
+The above could have been written as:
+
+ ls ~/images/*.* | tr A-Z a-z | cut -d. -f2 | bkt
+
+...except it wouldn't handle filenames with multiple dots in them, like
+image.01.jpg. Replacing the cut with "sed 's,.*\.,,'" would fix that,
+but it's still a lot more keystrokes.
+
+--
+
What percentage of words in a text file are capitalized?
bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt
+--
+
Given a CSV file with fields lastname, firstname, phonenumber:
Blow,Joe,444-555-0123
@@ -283,7 +373,9 @@ Given a CSV file with fields lastname, firstname, phonenumber:
...to get a breakdown by area code:
- bkt -d, -f2 -b1-3
+ bkt -d, -f3 -b1-3 phonelist.csv
+
+--
Suppose you have a team of people working on a large C++ or Java codebase.
By convention, TODO comments are written as:
@@ -302,6 +394,11 @@ summary of these with:
jane 23 28.4%
bob 37 45.7%
+If you're more comfortable with sed than perl, you could write the
+above as:
+
+ find . -name '*.c++' | xargs sed -n 's,.*// *TODO *\([^:]*\):.*,\1,p' | bkt
+
=head1 AUTHOR
B. Watson <yalhcru@gmail.com>
@@ -375,7 +472,7 @@ sub render {
}
# main()
-getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:L', \our %opt);
+getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:C', \our %opt);
# -h == --help
HELP_MESSAGE(), exit(0) if $opt{h};
@@ -437,17 +534,31 @@ if(defined $opt{d}) {
}
# handle -b arg
-our $substrarg;
+our @substrargs;
if(defined $opt{b}) {
- for($opt{b}) {
- /^(\d+)$/ && do { $substrarg = "$1 - 1, 1" };
- /^(\d+)-$/ && do { $substrarg = "$1 - 1" };
- /^-(\d+)$/ && do { $substrarg = "0, $1" };
- /^(\d+)-(\d+)$/ && do { $substrarg = "$1 - 1, " . ($2 - $1 + 1) };
- /^-(\d+)-$/ && do { $substrarg = "$1"; };
- /^-(\d+)-(\d+)$/ && do { $substrarg = "$1, " . ($2 - $1); };
+ for(split /,/, $opt{b}) { # each comma-separated range becomes one substr() argument string
+ my $s; # scoped per range, so an invalid range can't reuse the previous range's value
+ /^(\d+)$/ && do { $s = "$1 - 1, 1" };
+ /^(\d+)-$/ && do { $s = "$1 - 1" };
+ /^-(\d+)$/ && do { $s = "0, $1" };
+ /^(\d+)-(\d+)$/ && do { $s = "$1 - 1, " . ($2 - $1 + 1) };
+ /^-(\d+)-$/ && do { $s = "-$1" }; # negative substr offset counts from the end (-1 = last)
+ /^-(\d+)-(\d+)$/ && do { $s = "-$1, " . ($1 - $2 + 1) }; # Mth-to-last through Nth-to-last, inclusive
+ die "$SELF: invalid -b argument\n" unless $s;
+ push @substrargs, $s;
+ }
+}
+
+# handle -l arg. similar to but simpler than -b arg.
+our ($startrec, $endrec);
+if(defined $opt{l}) {
+ for($opt{l}) {
+ /^(\d+)$/ && do { $startrec = $endrec = $1 };
+ /^(\d+)-$/ && do { $startrec = $1 };
+ /^-(\d+)$/ && do { $startrec = 1; $endrec = $1 };
+ /^(\d+)-(\d+)$/ && do { $startrec = $1 ; $endrec = $2 };
+ die "$SELF: invalid -l argument\n" unless $startrec;
}
- die "$SELF: invalid -b argument\n" unless $substrarg;
}
# -f index starts at 1, perl arrays are indexed from 0, fix (but
@@ -522,6 +633,9 @@ for(@ARGV) {
$readfiles++;
while(<$fh>) {
+ next if defined $startrec && $. < $startrec;
+ next if defined $endrec && $. > $endrec;
+
chomp;
# behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist,
@@ -532,9 +646,13 @@ for(@ARGV) {
$_ = "" unless defined $_;
}
- if($substrarg) { # set via $opt{b}
+ if(@substrargs) { # set via $opt{b}
+ my $out = "";
+ my $in = $_;
no warnings qw/substr/;
- eval "\$_ = substr(\$_, $substrarg)";
+ eval "\$out .= substr(\$in, $_)" for(@substrargs);
+ die $@ if $@;
+ $_ = $out;
$_ = "" unless defined $_;
}
@@ -543,6 +661,9 @@ for(@ARGV) {
s/\s//g if $opt{W};
s/\W//g if $opt{n};
+ next if defined $opt{g} && !/$opt{g}/o;
+ next if defined $opt{v} && /$opt{v}/o;
+
if($opt{e}) {
no strict;
no warnings qw/exiting/; # so -e code can "next" to skip a record
@@ -552,6 +673,8 @@ for(@ARGV) {
next if $opt{k} && (!defined || length == 0);
$_ = "" unless defined $_;
+ print render($_) . "\n" if $opt{C};
+
$counts{$_}++;
$total++;
}
@@ -559,37 +682,39 @@ for(@ARGV) {
die "$SELF: couldn't read any input files\n" unless $readfiles;
-if($opt{T}) {
- (my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
- if($thresh) {
- for(keys %counts) {
- delete $counts{$_} if
- ($pct && (($counts{$_} * 100 / $total) < $thresh)) ||
- (!$pct && ($counts{$_} < $thresh));
+if(!$opt{C}) {
+ if($opt{T}) {
+ (my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
+ if($thresh) {
+ for(keys %counts) {
+ delete $counts{$_} if
+ ($pct && (($counts{$_} * 100 / $total) < $thresh)) ||
+ (!$pct && ($counts{$_} < $thresh));
+ }
+ } else {
+ die "$SELF: invalid argument for -T\n";
}
- } else {
- die "$SELF: invalid argument for -T\n";
}
-}
-if(!$opt{P}) {
- for(keys %counts) {
- my $l = length(render($_));
- $longest = $l if $longest < $l;
+ if(!$opt{P}) {
+ for(keys %counts) {
+ my $l = length(render($_));
+ $longest = $l if $longest < $l;
+ }
}
-}
# done reading & counting all input, show the results.
-for(sort { eval $sortcode } keys %counts) {
- print (my $printable = render($_));
- print " " x ($longest - length($printable)) unless $opt{P};
- print $opt{o} . $counts{$_} unless $opt{p};
- printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
- print "\n";
-}
+ for(sort { eval $sortcode } keys %counts) {
+ print (my $printable = render($_));
+ print " " x ($longest - length($printable)) unless $opt{P};
+ print $opt{o} . $counts{$_} unless $opt{p};
+ printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
+ print "\n";
+ }
-if($opt{t}) {
- print "\n-- Total count: $total\n";
+ if($opt{t}) {
+ print "\n-- Total count: $total\n";
+ }
}
# be like cat, exit with error status if any input file couldn't be