From b182e3f487614d87f6b79ee7d182c6f504d877af Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Tue, 13 Oct 2015 07:19:02 -0400 Subject: Add -l -g -v -C, doc updates --- bkt | 379 +++++++++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 252 insertions(+), 127 deletions(-) diff --git a/bkt b/bkt index 0fee2f4..ffa31b6 100755 --- a/bkt +++ b/bkt @@ -44,9 +44,16 @@ variants of: | perl -lne 's/\s.*//; $a{$_}++; END { print "$_ $a{$_}" for sort keys %a }' +The above could be written as: + + | bkt -f1 + =head1 OPTIONS -=head2 General options +Options that don't take arguments may be bundled: B<-BipW> is the same +as B<-B> B<-i> B<-p> B<-W>. + +=head2 General Options =over @@ -64,54 +71,9 @@ End of options. Everything after this is treated as a filename. =back -=head2 Output options - -=over - -=item B<-c> - -Show counts only (suppress percentages). - -=item B<-p> - -Show percentages only (suppress counts). --c and -p may be combined, if you can find a use for it. - -=item B<-t> - -Show total count after all item counts. - -=item B<-x> - -Print output in hexadecimal. - -=item B<-a> - -ASCII output: render non-ASCII characters as hex escapes. - -=item B<-s> I - -Output sort options. Options may include: - - r - reverse sort (default is ascending) - a - sort alphabetically (default is by count, then alpha) - f - fold case - -=item B<-T> I - -Filter out results below threshold, which may be a count or a percentage, e.g. 5%. - -=item B<-o> I - -Use string as output delimiter (default: \\t). Implies -P. +=head2 Input Options -=item B<-P> - -Don't pad output with spaces to length of longest element. The -o option enables this as well. - -=back - -=head2 Input options +These options are applied to each input file when it's first opened for reading. =over @@ -123,34 +85,70 @@ specified by the current locale. B<-B> treats the input as a stream of =item B<-r> I -Read input as fixed-size records. This can't be combined with B<-/>. 
+Read input as fixed-size records. This can't be combined with B<-/>, +as it sets the value of B<$/> to a reference to its argument. See +I. -=item B<-/> I +=item B<-/> I Set value of B<$/>, perl's input record separator. Default is I<\n>. -One of I<-w> I<-W> I<-n> is highly recommended with this option. This +One of B<-w> B<-W> B<-n> is highly recommended with this option. This can't be combined with B<-r>. =back -=head2 Transform options +=head2 Transform Options + +These options are applied to each record of input, in the order listed +here, before it's counted. If you haven't used B<-r> or B<-/>, a record +is a line. + +The B<-f> and B<-b> options can be used together, which is unlike B<cut>(1). =over +=item B<-l> I + +Operate only on records numbered within the range, e.g. I<1-3> for the +first 3 records of each file, I<10-> for 10th through the end, I<15> +for only the 15th record. The starting record is optional and defaults +to 1, so I<-3> could be used for the first example. If multiple files +are given, the numbering resets to 1 at the start of each file. + =item B<-d> I -Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a fixed +Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a literal string or a regular expression (if enclosed in //, with optional /i -modifier). This option does nothing without B<-f>. +modifier). As a special case, I alone is treated as a literal string. +This option does nothing without B<-f>. B<-d>'s argument is used with +perl's B, so you might read I to understand this. =item B<-f> I -Consider only this (B<-d> delimiter separated) field. +Consider only this (B<-d> delimiter separated) field. Unlike B<cut>, +only one field may be selected (B<cut>'s lists of fields are not +supported). Also unlike B<cut>, negative field numbers can be used to +index from the rightmost field (which is numbered B<-1>). =item B<-b> I -Consider only a range of characters (or bytes, if B<-B>) in each record. 
-Example: I<1-3> for the first 3 bytes/chars of each input record. +Like B: Consider only a range of characters (or bytes, if B<-B>) +in each record. Example: I<1-3> for the first 3 bytes/chars of each +input record. + +B<-b> supports the same types of range as cut(1): + + N N'th byte/character, counted from 1 + N- from N'th byte/character to end of record + N-M from N'th to M'th (included) byte/character + -M from first to M'th (included) byte/character + +...plus 2 extra types: + + -M- from Mth-to-last byte/character (included) to end of record (-1 = last) + -M-N from Mth-to-last byte/character to Nth-to-last (included) + +...including multiple ranges separated by commas. =item B<-i> @@ -169,10 +167,20 @@ Remove ALL whitespace from input records. Remove all non-word (I<\W>) characters from input records. +=item B<-g> I + +Grep for regex. Equivalent to: B<-e 'next unless /regex/'>. Remember +this is a perl-style regex, not a B(1) one. + +=item B<-v> I + +Grep for records not containing regex. Equivalent to: B<-e 'next if /regex/'>. + =item B<-e> I Execute perl code for each input record. The code should modify B<$_>. -Make sure you quote the argument as needed by your shell. +Make sure you quote the argument as needed by your shell. See NOTES +below for more information. =item B<-k> @@ -188,47 +196,79 @@ Letter frequency count. Alias for B<-inkr1>. =back -Options that don't take arguments may be bundled: B<-BipW> is the same as -B<-B> B<-i> B<-p> B<-W>. +=head2 Output Options -Input will be read from filenames given on the command line, or -from standard input if none given, or if the filename B<-> (hyphen) is -given. Use B<./-> to read file a real file named B<->. The input need -not be sorted. The output will always be sorted. +These options are applied after all I/O and counting is done, and +only affect how the output is printed. -Each input record is chomped before any further processing. 
+=over -B<-b> is like the B<-b> or B<-c> option to cut(1) (depending on whether B<-B> is -set). It supports the same types of range as cut(1): +=item B<-c> - N N'th byte/character, counted from 1 - N- from N'th byte/character to end of record - N-M from N'th to M'th (included) byte/character - -M from first to M'th (included) byte/character +Show counts only (suppress percentages). -...plus 2 extra types: +=item B<-p> - -M- from Mth-to-last byte/character to end of record (-1 = last) - -M-N from Mth-to-last byte/characters to Nth-to-last +Show percentages only (suppress counts). +B<-c> and B<-p> may be combined, if you can find a use for it. -...except that cut allows many ranges separated by commas, while B -B<-b> only allows a single range. +=item B<-t> + +Show total count after all item counts. + +=item B<-C> + +Print the records themselves, instead of the counts. Allows B to +be used as a general-purpose text manipulation tool. Mainly this option +was implemented for debugging purposes, but it might be useful for other +stuff. When B<-C> is used, B<-a> and B<-x> still work, but none of the +other output options have any effect. + +=item B<-x> + +Print output records as hexadecimal. + +=item B<-a> + +ASCII output: render non-ASCII characters as hex escapes. + +=item B<-s> I + +Output sort options. Options may include: + + r - reverse sort (default is ascending) + a - sort alphabetically (default is by count, then alpha) + f - fold case -B<-d> is like the the -d option to cut(1), except that the delimiter can -be multiple characters. Also, the delimiter is treated as a regular -expression if it's at least 3 characters long *and* enclosed in I. The -/i modifier is supported, but none of the other /x regex modifiers are. +=item B<-T> I + +Filter out results below threshold, which may be a count or a percentage, e.g. 5%. 
+ +=item B<-o> I -B<-f> like cut's B<-f>, except that it only allows a single field number (not -a list), which is indexed starting from 1 (same as cut)... or a negative -number, meaning the Nth field from the right (-1 = rightmost). Also -unlike cut, B<-f> and B<-b> may be combined (B<-f> is applied first). +Use string as output delimiter (default: \\t). Implies B<-P>. + +=item B<-P> + +Don't pad output with spaces to length of longest element. +The B<-o> option enables this as well. + +=back -The B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-e>I B<-k> options will -be processed in the order listed here, regardless of the order they're -given on the command line. In particular, this means the code for B<-e> -will see B<$_> *after* it's been modified by any of the other options -(except B<-k>). +=head1 NOTES + +Input will be read from filenames given on the command line, or +from standard input if none given, or if the filename B<-> (hyphen) is +given. Use B<./-> to read a real file named B<->. The input need +not be sorted. The output will always be sorted. + +Each input record is chomped before any further processing. + +The B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g> B<-v> B<-e>I +B<-k> options will be applied to each record in the order listed here, +regardless of the order they're given on the command line. In particular, +this means the code for B<-e> will see B<$_> *after* it's been modified +by any of the other options (except B<-k>). The code for B<-e> will run with strict disabled and warnings enabled. To disable warnings, prefix the code with 'no warnings;'. There can only @@ -244,6 +284,17 @@ The astute reader will have noticed that all the other transform options could be written as code for B<-e>. This is correct: the other options exist to support lazy typists such as the author. +=head1 EXIT STATUS + +B<bkt> exits with zero (success) status if all operations were successful, +otherwise non-zero. 
Currently, there are no specific non-zero exit codes +(e.g. different ones for different error types), though this may change +in the future. + +If some files couldn't be read, but at least one could, the file(s) +that were readable are processed normally and the exit status will be +non-zero. This mimics the behaviour of GNU cat(1), head(1), tail(1), etc. + =head1 EXAMPLES Show the percentage of binaries that start with each letter/number/etc, @@ -255,26 +306,65 @@ Show the percentage of binaries that start with each letter/number/etc, ls | bkt -e '$_=substr($_,0,1)' ls | bkt -e 's,^(.).*,$1,' +-- + Show percentages of lines said by each user in an irssi IRC log. Relies on the log format having a timestamp, space, for normal lines. Misses /me actions entirely though. Add -sr to show the most talkative first. - bkt -f2 -e 'next unless /^\ only: + + bkt -l1 -g'#!' scripts/* + +Which is equivalent to: + + head -qn1 scripts/* | grep '#!' | bkt + +If you wanted to make the same assumption the OS does, that a script +missing its shebang line is a #!/bin/sh script: + + bkt -l1 -e'$_="#!/bin/sh" unless /#!/' scripts/* + +If some of them might be ELF executables, add "-B -v ELF" to the above. + +-- Show us how many users use each shell (including stuff like /bin/false). bkt -d: -f-1 /etc/passwd +-- + How many images of each type have we got? Ignore case, so JPG and jpg are counted together. ls ~/images/*.* | bkt -i -d. -f-1 +The above could have been written as: + + ls ~/images/*.* | tr A-Z a-z | cut -d. -f2 | bkt + +...except it wouldn't handle filenames with multiple dots in them, like +image.01.jpg. Replacing the cut with "sed 's,.*\.,,'" would fix that, +but it's still a lot more keystrokes. + +-- + What percentage of words in a text file are capitalized? 
bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt +-- + Given a CSV file with fields lastname, firstname, phonenumber: Blow,Joe,444-555-0123 @@ -283,7 +373,9 @@ Given a CSV file with fields lastname, firstname, phonenumber: ...to get a breakdown by area code: - bkt -d, -f2 -b1-3 + bkt -d, -f3 -b1-3 phonelist.csv + +-- Suppose you have a team of people working on a large C++ or Java codebase. By convention, TODO comments are written as: @@ -302,6 +394,11 @@ summary of these with: jane 23 28.4% bob 37 45.7% +If you're more comfortable with sed than perl, you could write the +above as: + + find . -name '*.c++' | xargs sed -n 's,.*// *TODO *\([^:]*\):.*,\1,p' | bkt + =head1 AUTHOR B. Watson @@ -375,7 +472,7 @@ sub render { } # main() -getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:L', \our %opt); +getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:C', \our %opt); # -h == --help HELP_MESSAGE(), exit(0) if $opt{h}; @@ -437,17 +534,31 @@ if(defined $opt{d}) { } # handle -b arg -our $substrarg; +our @substrargs; if(defined $opt{b}) { - for($opt{b}) { - /^(\d+)$/ && do { $substrarg = "$1 - 1, 1" }; - /^(\d+)-$/ && do { $substrarg = "$1 - 1" }; - /^-(\d+)$/ && do { $substrarg = "0, $1" }; - /^(\d+)-(\d+)$/ && do { $substrarg = "$1 - 1, " . ($2 - $1 + 1) }; - /^-(\d+)-$/ && do { $substrarg = "$1"; }; - /^-(\d+)-(\d+)$/ && do { $substrarg = "$1, " . ($2 - $1); }; + my $s; + for(split /,/, $opt{b}) { + /^(\d+)$/ && do { $s = "$1 - 1, 1" }; + /^(\d+)-$/ && do { $s = "$1 - 1" }; + /^-(\d+)$/ && do { $s = "0, $1" }; + /^(\d+)-(\d+)$/ && do { $s = "$1 - 1, " . ($2 - $1 + 1) }; + /^-(\d+)-$/ && do { $s = "$1"; }; + /^-(\d+)-(\d+)$/ && do { $s = "$1, " . ($2 - $1); }; + die "$SELF: invalid -b argument\n" unless $s; + push @substrargs, $s; + } +} + +# handle -l arg. similar to but simpler than -b arg. 
+our ($startrec, $endrec); +if(defined $opt{l}) { + for($opt{l}) { + /^(\d+)$/ && do { $startrec = $endrec = $1 }; + /^(\d+)-$/ && do { $startrec = $1 }; + /^-(\d+)$/ && do { $startrec = 1; $endrec = $1 }; + /^(\d+)-(\d+)$/ && do { $startrec = $1 ; $endrec = $2 }; + die "$SELF: invalid -l argument\n" unless $startrec; } - die "$SELF: invalid -b argument\n" unless $substrarg; } # -f index starts at 1, perl arrays are indexed from 0, fix (but @@ -522,6 +633,9 @@ for(@ARGV) { $readfiles++; while(<$fh>) { + next if defined $startrec && $. < $startrec; + next if defined $endrec && $. > $endrec; + chomp; # behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist, @@ -532,9 +646,13 @@ for(@ARGV) { $_ = "" unless defined $_; } - if($substrarg) { # set via $opt{b} + if(@substrargs) { # set via $opt{b} + my $out = ""; + my $in = $_; no warnings qw/substr/; - eval "\$_ = substr(\$_, $substrarg)"; + eval "\$out .= substr(\$in, $_)" for(@substrargs); + die $@ if $@; + $_ = $out; $_ = "" unless defined $_; } @@ -543,6 +661,9 @@ for(@ARGV) { s/\s//g if $opt{W}; s/\W//g if $opt{n}; + next if defined $opt{g} && !/$opt{g}/o; + next if defined $opt{v} && /$opt{v}/o; + if($opt{e}) { no strict; no warnings qw/exiting/; # so -e code can "next" to skip a record @@ -552,6 +673,8 @@ for(@ARGV) { next if $opt{k} && (!defined || length == 0); $_ = "" unless defined $_; + print render($_) . 
"\n" if $opt{C}; + $counts{$_}++; $total++; } @@ -559,37 +682,39 @@ for(@ARGV) { die "$SELF: couldn't read any input files\n" unless $readfiles; -if($opt{T}) { - (my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/); - if($thresh) { - for(keys %counts) { - delete $counts{$_} if - ($pct && (($counts{$_} * 100 / $total) < $thresh)) || - (!$pct && ($counts{$_} < $thresh)); +if(!$opt{C}) { + if($opt{T}) { + (my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/); + if($thresh) { + for(keys %counts) { + delete $counts{$_} if + ($pct && (($counts{$_} * 100 / $total) < $thresh)) || + (!$pct && ($counts{$_} < $thresh)); + } + } else { + die "$SELF: invalid argument for -T\n"; } - } else { - die "$SELF: invalid argument for -T\n"; } -} -if(!$opt{P}) { - for(keys %counts) { - my $l = length(render($_)); - $longest = $l if $longest < $l; + if(!$opt{P}) { + for(keys %counts) { + my $l = length(render($_)); + $longest = $l if $longest < $l; + } } -} # done reading & counting all input, show the results. -for(sort { eval $sortcode } keys %counts) { - print (my $printable = render($_)); - print " " x ($longest - length($printable)) unless $opt{P}; - print $opt{o} . $counts{$_} unless $opt{p}; - printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c}; - print "\n"; -} + for(sort { eval $sortcode } keys %counts) { + print (my $printable = render($_)); + print " " x ($longest - length($printable)) unless $opt{P}; + print $opt{o} . $counts{$_} unless $opt{p}; + printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c}; + print "\n"; + } -if($opt{t}) { - print "\n-- Total count: $total\n"; + if($opt{t}) { + print "\n-- Total count: $total\n"; + } } # be like cat, exit with error status if any input file couldn't be -- cgit v1.2.3