From b182e3f487614d87f6b79ee7d182c6f504d877af Mon Sep 17 00:00:00 2001
From: "B. Watson" <yalhcru@gmail.com>
Date: Tue, 13 Oct 2015 07:19:02 -0400
Subject: Add -l -g -v -C, doc updates

---
 bkt | 379 +++++++++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 252 insertions(+), 127 deletions(-)
diff --git a/bkt b/bkt
index 0fee2f4..ffa31b6 100755
--- a/bkt
+++ b/bkt
@@ -44,9 +44,16 @@ variants of:
 
 	<shell commands> | perl -lne 's/\s.*//; $a{$_}++; END { print "$_ $a{$_}" for sort keys %a }'
 
+The above could be written as:
+
+	<shell commands> | bkt -f1
+
 =head1 OPTIONS
 
-=head2 General options
+Options that don't take arguments may be bundled: B<-BipW> is the same
+as B<-B> B<-i> B<-p> B<-W>.
+
+=head2 General Options
 
 =over
 
@@ -64,54 +71,9 @@ End of options. Everything after this is treated as a filename.
 
 =back
 
-=head2 Output options
-
-=over
-
-=item B<-c>
-
-Show counts only (suppress percentages).
-
-=item B<-p>
-
-Show percentages only (suppress counts).
--c and -p may be combined, if you can find a use for it.
-
-=item B<-t>
-
-Show total count after all item counts.
-
-=item B<-x>
-
-Print output in hexadecimal.
-
-=item B<-a>
-
-ASCII output: render non-ASCII characters as hex escapes.
-
-=item B<-s> I<opts>
-
-Output sort options. Options may include:
-
-	r - reverse sort (default is ascending)
-	a - sort alphabetically (default is by count, then alpha)
-	f - fold case
-
-=item B<-T> I<thresh[%]>
-
-Filter out results below threshold, which may be a count or a percentage, e.g. 5%.
-
-=item B<-o> I<string>
-
-Use string as output delimiter (default: \\t). Implies -P.
+=head2 Input Options
 
-=item B<-P>
-
-Don't pad output with spaces to length of longest element. The -o option enables this as well.
-
-=back
-
-=head2 Input options
+These options are applied to each input file when it's first opened for reading.
 
 =over
 
@@ -123,34 +85,70 @@ specified by the current locale. B<-B> treats the input as a stream of
 
 =item B<-r> I<recordsize>
 
-Read input as fixed-size records. This can't be combined with B<-/>.
+Read input as fixed-size records. This can't be combined with B<-/>,
+as it sets the value of B<$/> to a reference to its argument. See
+I<perldoc -v '$/'>.
 
-=item B<-/> I<sep>
+=item B<-/> I<separator>
 
 Set value of B<$/>, perl's input record separator. Default is I<\n>.
-One of I<-w> I<-W> I<-n> is highly recommended with this option. This
+One of B<-w> B<-W> B<-n> is highly recommended with this option. This
 can't be combined with B<-r>.
 
 =back
 
-=head2 Transform options
+=head2 Transform Options
+
+These options are applied to each record of input, in the order listed
+here, before it's counted. If you haven't used B<-r> or B<-/>, a record
+is a line.
+
+The B<-f> and B<-b> options can be used together, which is unlike B<cut>(1).
 
 =over
 
+=item B<-l> I<range>
+
+Operate only on records numbered within the range, e.g. I<1-3> for the
+first 3 records of each file, I<10-> for 10th through the end, I<15>
+for only the 15th record. The starting record is optional and defaults
+to 1, so I<-3> could be used for the first example. If multiple files
+are given, the numbering resets to 1 at the start of each file.
+
 =item B<-d> I<delim>
 
-Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a fixed
+Delimiter for B<-f>. Default: /\\s+/ aka whitespace. This can be a literal
 string or a regular expression (if enclosed in //, with optional /i
-modifier). This option does nothing without B<-f>.
+modifier). As a special case, I<//> alone is treated as a literal string.
+This option does nothing without B<-f>. B<-d>'s argument is used with
+perl's B<split>, so you might read I<perldoc -f split> to understand this.
 
 =item B<-f> I<field>
 
-Consider only this (B<-d> delimiter separated) field.
+Consider only this (B<-d> delimiter separated) field. Unlike B<cut>,
+only one field may be selected (B<cut>'s lists of fields are not
+supported). Also unlike B<cut>, negative field numbers can be used to
+index from the rightmost field (which is numbered B<-1>).
 
 =item B<-b> I<range>
 
-Consider only a range of characters (or bytes, if B<-B>) in each record.
-Example: I<1-3> for the first 3 bytes/chars of each input record.
+Like B<cut -b>: Consider only a range of characters (or bytes, if B<-B>)
+in each record. Example: I<1-3> for the first 3 bytes/chars of each
+input record.
+
+B<-b> supports the same types of range as cut(1):
+
+	N      N'th byte/character, counted from 1
+	N-     from N'th byte/character to end of record
+	N-M    from N'th to M'th (included) byte/character
+	-M     from first to M'th (included) byte/character
+
+...plus 2 extra types:
+
+	-M-    from Mth-to-last byte/character (included) to end of record (-1 = last)
+	-M-N   from Mth-to-last byte/character to Nth-to-last (included)
+
+...including multiple ranges separated by commas.
 
 =item B<-i>
 
@@ -169,10 +167,20 @@ Remove ALL whitespace from input records.
 
 Remove all non-word (I<\W>) characters from input records.
 
+=item B<-g> I<regex>
+
+Grep for regex. Equivalent to: B<-e 'next unless /regex/'>. Remember
+this is a perl-style regex, not a B<grep>(1) one.
+
+=item B<-v> I<regex>
+
+Grep for records not containing regex. Equivalent to: B<-e 'next if /regex/'>.
+
 =item B<-e> I<code>
 
 Execute perl code for each input record. The code should modify B<$_>.
-Make sure you quote the argument as needed by your shell.
+Make sure you quote the argument as needed by your shell. See NOTES
+below for more information.
 
 =item B<-k>
 
@@ -188,47 +196,79 @@ Letter frequency count. Alias for B<-inkr1>.
 
 =back
 
-Options that don't take arguments may be bundled: B<-BipW> is the same as
-B<-B> B<-i> B<-p> B<-W>.
+=head2 Output Options
 
-Input will be read from filenames given on the command line, or
-from standard input if none given, or if the filename B<-> (hyphen) is
-given. Use B<./-> to read file a real file named B<->. The input need
-not be sorted. The output will always be sorted.
+These options are applied after all I/O and counting is done, and
+only affect how the output is printed.
 
-Each input record is chomped before any further processing.
+=over
 
-B<-b> is like the B<-b> or B<-c> option to cut(1) (depending on whether B<-B> is
-set). It supports the same types of range as cut(1):
+=item B<-c>
 
-	N      N'th byte/character, counted from 1
-	N-     from N'th byte/character to end of record
-	N-M    from N'th to M'th (included) byte/character
-	-M     from first to M'th (included) byte/character
+Show counts only (suppress percentages).
 
-...plus 2 extra types:
+=item B<-p>
 
-	-M-    from Mth-to-last byte/character to end of record (-1 = last)
-	-M-N   from Mth-to-last byte/characters to Nth-to-last
+Show percentages only (suppress counts).
+B<-c> and B<-p> may be combined, if you can find a use for it.
 
-...except that cut allows many ranges separated by commas, while B<bkt>
-B<-b> only allows a single range.
+=item B<-t>
+
+Show total count after all item counts.
+
+=item B<-C>
+
+Print the records themselves, instead of the counts. Allows B<bkt> to
+be used as a general-purpose text manipulation tool. Mainly this option
+was implemented for debugging purposes, but it might be useful for other
+stuff. When B<-C> is used, B<-a> and B<-x> still work, but none of the
+other output options have any effect.
+
+=item B<-x>
+
+Print output records as hexadecimal.
+
+=item B<-a>
+
+ASCII output: render non-ASCII characters as hex escapes.
+
+=item B<-s> I<sortopts>
+
+Output sort options. Options may include:
+
+	r - reverse sort (default is ascending)
+	a - sort alphabetically (default is by count, then alpha)
+	f - fold case
 
-B<-d> is like the the -d option to cut(1), except that the delimiter can
-be multiple characters. Also, the delimiter is treated as a regular
-expression if it's at least 3 characters long *and* enclosed in I<//>. The
-/i modifier is supported, but none of the other /x regex modifiers are.
+=item B<-T> I<thresh[%]>
+
+Filter out results below threshold, which may be a count or a percentage, e.g. 5%.
+
+=item B<-o> I<string>
 
-B<-f> like cut's B<-f>, except that it only allows a single field number (not
-a list), which is indexed starting from 1 (same as cut)... or a negative
-number, meaning the Nth field from the right (-1 = rightmost). Also
-unlike cut, B<-f> and B<-b> may be combined (B<-f> is applied first).
+Use string as output delimiter (default: \\t). Implies B<-P>.
+
+=item B<-P>
+
+Don't pad output with spaces to length of longest element.
+The B<-o> option enables this as well.
+
+=back
 
-The B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-e>I<code> B<-k> options will
-be processed in the order listed here, regardless of the order they're
-given on the command line. In particular, this means the code for B<-e>
-will see B<$_> *after* it's been modified by any of the other options
-(except B<-k>).
+=head1 NOTES
+
+Input will be read from filenames given on the command line, or
+from standard input if none given, or if the filename B<-> (hyphen) is
+given. Use B<./-> to read a real file named B<->. The input need
+not be sorted. The output will always be sorted.
+
+Each input record is chomped before any further processing.
+
+The B<-l> B<-f> B<-b> B<-i> B<-w> B<-W> B<-n> B<-g> B<-v> B<-e>I<code>
+B<-k> options will be applied to each record in the order listed here,
+regardless of the order they're given on the command line. In particular,
+this means the code for B<-e> will see B<$_> *after* it's been modified
+by any of the other options (except B<-k>).
 
 The code for B<-e> will run with strict disabled and warnings enabled. To
 disable warnings, prefix the code with 'no warnings;'. There can only
@@ -244,6 +284,17 @@ The astute reader will have noticed that all the other transform options
 could be written as code for B<-e>. This is correct: the other options
 exist to support lazy typists such as the author.
 
+=head1 EXIT STATUS
+
+B<bkt> exits with zero (success) status if all operations were successful,
+otherwise non-zero. Currently, there are no specific non-zero exit codes
+(e.g. different ones for different error types), though this may change
+in the future.
+
+If some files couldn't be read, but at least one could, the file(s)
+that were readable are processed normally and the exit status will be
+non-zero. This mimics the behaviour of GNU cat(1), head(1), tail(1), etc.
+
 =head1 EXAMPLES
 
 Show the percentage of binaries that start with each letter/number/etc,
@@ -255,26 +306,65 @@ Show the percentage of binaries that start with each letter/number/etc,
 	ls | bkt -e '$_=substr($_,0,1)'
 	ls | bkt -e 's,^(.).*,$1,'
 
+--
+
 Show percentages of lines said by each user in an irssi IRC log. Relies
 on the log format having a timestamp, space, <nick> for normal lines.
 Misses /me actions entirely though. Add -sr to show the most talkative
 first.
 
-	bkt -f2 -e 'next unless /^\</' channelname.log
+	bkt -f2 -g'<' channelname.log
+
+--
+
+We have a directory full of scripts, a mix of perl, python, shell, etc.
+How many scripts are written in each language? Skip any files that
+don't start with a #! (shebang) line.
+
+Using B<bkt> only:
+
+   bkt -l1 -g'#!' scripts/*
+
+Which is equivalent to:
+
+   head -qn1 scripts/* | grep '#!' | bkt
+
+If you wanted to make the same assumption the OS does, that a script
+missing its shebang line is a #!/bin/sh script:
+
+	bkt -l1 -e'$_="#!/bin/sh" unless /#!/' scripts/*
+
+If some of them might be ELF executables, add "-B -v ELF" to the above.
+
+--
 
 Show us how many users use each shell (including stuff like /bin/false).
 
 	bkt -d: -f-1 /etc/passwd
 
+--
+
 How many images of each type have we got? Ignore case, so JPG and jpg
 are counted together.
 
 	ls ~/images/*.* | bkt -i -d. -f-1
 
+The above could have been written as:
+
+	ls ~/images/*.* | tr A-Z a-z | cut -d. -f2 | bkt
+
+...except it wouldn't handle filenames with multiple dots in them, like
+image.01.jpg. Replacing the cut with "sed 's,.*\.,,'" would fix that,
+but it's still a lot more keystrokes.
+
+--
+
 What percentage of words in a text file are capitalized?
 
 	bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt
 
+--
+
 Given a CSV file with fields lastname, firstname, phonenumber:
 
 	Blow,Joe,444-555-0123
@@ -283,7 +373,9 @@ Given a CSV file with fields lastname, firstname, phonenumber:
 
 ...to get a breakdown by area code:
 
-   bkt -d, -f2 -b1-3
+   bkt -d, -f3 -b1-3 phonelist.csv
+
+--
 
 Suppose you have a team of people working on a large C++ or Java codebase.
 By convention, TODO comments are written as:
@@ -302,6 +394,11 @@ summary of these with:
    jane   23   28.4%
    bob    37   45.7%
 
+If you're more comfortable with sed than perl, you could write the
+above as:
+
+  find . -name '*.c++' | xargs sed -n 's,.*//  *TODO  *\([^:]*\):.*,\1,p' | bkt
+
 =head1 AUTHOR
 
 B. Watson <yalhcru@gmail.com>
@@ -375,7 +472,7 @@ sub render {
 }
 
 # main()
-getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:L', \our %opt);
+getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:Ll:g:v:C', \our %opt);
 
 # -h == --help
 HELP_MESSAGE(), exit(0) if $opt{h};
@@ -437,17 +534,31 @@ if(defined $opt{d}) {
 }
 
 # handle -b arg
-our $substrarg;
+our @substrargs;
 if(defined $opt{b}) {
-	for($opt{b}) {
-		/^(\d+)$/ && do        { $substrarg = "$1 - 1, 1" };
-		/^(\d+)-$/ && do       { $substrarg = "$1 - 1" };
-		/^-(\d+)$/ && do       { $substrarg = "0, $1" };
-		/^(\d+)-(\d+)$/ && do  { $substrarg = "$1 - 1, " . ($2 - $1 + 1) };
-		/^-(\d+)-$/ && do      { $substrarg = "$1"; };
-		/^-(\d+)-(\d+)$/ && do { $substrarg = "$1, " . ($2 - $1); };
+	for(split /,/, $opt{b}) {
+		my $s; # fresh per range, so a malformed range can't silently reuse the previous one
+		/^(\d+)$/ && do        { $s = "$1 - 1, 1" };
+		/^(\d+)-$/ && do       { $s = "$1 - 1" };
+		/^-(\d+)$/ && do       { $s = "0, $1" };
+		/^(\d+)-(\d+)$/ && do  { $s = "$1 - 1, " . ($2 - $1 + 1) };
+		/^-(\d+)-$/ && do      { $s = "$1"; };
+		/^-(\d+)-(\d+)$/ && do { $s = "$1, " . ($2 - $1); };
+		die "$SELF: invalid -b argument\n" unless $s;
+		push @substrargs, $s;
+	}
+}
+
+# handle -l arg. similar to but simpler than -b arg.
+our ($startrec, $endrec);
+if(defined $opt{l}) {
+	for($opt{l}) {
+		/^(\d+)$/ && do       { $startrec = $endrec = $1 };
+		/^(\d+)-$/ && do      { $startrec = $1 };
+		/^-(\d+)$/ && do      { $startrec = 1; $endrec = $1 };
+		/^(\d+)-(\d+)$/ && do { $startrec = $1 ; $endrec = $2 };
+		die "$SELF: invalid -l argument\n" unless $startrec;
 	}
-	die "$SELF: invalid -b argument\n" unless $substrarg;
 }
 
 # -f index starts at 1, perl arrays are indexed from 0, fix (but
@@ -522,6 +633,9 @@ for(@ARGV) {
 	$readfiles++;
 
 	while(<$fh>) {
+		next if defined $startrec && $. < $startrec;
+		next if defined $endrec   && $. > $endrec;
+
 		chomp;
 
 		# behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist,
@@ -532,9 +646,13 @@ for(@ARGV) {
 			$_ = "" unless defined $_;
 		}
 
-		if($substrarg) { # set via $opt{b}
+		if(@substrargs) { # set via $opt{b}
+			my $out = "";
+			my $in = $_;
 			no warnings qw/substr/;
-			eval "\$_ = substr(\$_, $substrarg)";
+			eval "\$out .= substr(\$in, $_)" for(@substrargs);
+			die $@ if $@;
+			$_ = $out;
 			$_ = "" unless defined $_;
 		}
 
@@ -543,6 +661,9 @@ for(@ARGV) {
 		s/\s//g if $opt{W};
 		s/\W//g if $opt{n};
 
+		next if defined $opt{g} && !/$opt{g}/o;
+		next if defined $opt{v} && /$opt{v}/o;
+
 		if($opt{e}) {
 			no strict;
 			no warnings qw/exiting/; # so -e code can "next" to skip a record
@@ -552,6 +673,8 @@ for(@ARGV) {
 		next if $opt{k} && (!defined || length == 0);
 		$_ = "" unless defined $_;
 
+		print render($_) . "\n" if $opt{C};
+
 		$counts{$_}++;
 		$total++;
 	}
@@ -559,37 +682,39 @@ for(@ARGV) {
 
 die "$SELF: couldn't read any input files\n" unless $readfiles;
 
-if($opt{T}) {
-	(my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
-	if($thresh) {
-		for(keys %counts) {
-			delete $counts{$_} if
-				($pct && (($counts{$_} * 100 / $total) < $thresh)) ||
-				(!$pct && ($counts{$_} < $thresh));
+if(!$opt{C}) {
+	if($opt{T}) {
+		(my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
+		if($thresh) {
+			for(keys %counts) {
+				delete $counts{$_} if
+					($pct && (($counts{$_} * 100 / $total) < $thresh)) ||
+					(!$pct && ($counts{$_} < $thresh));
+			}
+		} else {
+			die "$SELF: invalid argument for -T\n";
 		}
-	} else {
-		die "$SELF: invalid argument for -T\n";
 	}
-}
 
-if(!$opt{P}) {
-	for(keys %counts) {
-		my $l = length(render($_));
-		$longest = $l if $longest < $l;
+	if(!$opt{P}) {
+		for(keys %counts) {
+			my $l = length(render($_));
+			$longest = $l if $longest < $l;
+		}
 	}
-}
 
 # done reading & counting all input, show the results.
-for(sort { eval $sortcode } keys %counts) {
-	print (my $printable = render($_));
-	print " " x ($longest - length($printable)) unless $opt{P};
-	print $opt{o} . $counts{$_} unless $opt{p};
-	printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
-	print "\n";
-}
+	for(sort { eval $sortcode } keys %counts) {
+		print (my $printable = render($_));
+		print " " x ($longest - length($printable)) unless $opt{P};
+		print $opt{o} . $counts{$_} unless $opt{p};
+		printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
+		print "\n";
+	}
 
-if($opt{t}) {
-	print "\n-- Total count: $total\n";
+	if($opt{t}) {
+		print "\n-- Total count: $total\n";
+	}
 }
 
 # be like cat, exit with error status if any input file couldn't be
-- 
cgit v1.2.3