aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorB. Watson <yalhcru@gmail.com>2015-10-12 22:46:42 -0400
committerB. Watson <yalhcru@gmail.com>2015-10-12 22:46:42 -0400
commitc76bc179d5cdecc6a73a88f8ee47ff474e098e79 (patch)
tree0126213ab401fd10fc7bb5b617f4f02e32c7542c
parent440229f91b4eee243b7de550fa70e8a3b3f64600 (diff)
downloadmisc-scripts-c76bc179d5cdecc6a73a88f8ee47ff474e098e79.tar.gz
enable -B when LANG=C, add -a option
-rwxr-xr-xbkt124
1 files changed, 95 insertions, 29 deletions
diff --git a/bkt b/bkt
index 4dda169..8eb1f86 100755
--- a/bkt
+++ b/bkt
@@ -1,8 +1,11 @@
#!/usr/bin/perl
+# by popular demand:
use warnings;
use strict;
+# I wish there were a way to do this conditionally.
+# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std');
use open ":locale", ":std";
use Getopt::Std;
@@ -49,6 +52,7 @@ Output options:
-c and -p may be combined, if you can find a use for it
-t show total count
-x print output in hexadecimal
+ -a ASCII output: render non-ASCII characters as hex escapes
-s opts output sort options. opts may include:
r - reverse sort (default is ascending)
a - sort alphabetically (default is by count)
@@ -61,6 +65,7 @@ Output options:
Input options:
-B binary mode (default: input is characters in current locale)
+ -r int read input as fixed-size records (can't combine with -/)
-/ sep set value of \$/, perl's input record separator. default is \\n.
one of -w -W -n is highly recommended with this option.
-b range consider only a range of chars/bytes in each record (e.g. 1-3)
@@ -74,6 +79,7 @@ Input options:
make sure you quote the argument as needed by your shell)
-k skip blank records
-F word frequency count. alias for -ink/' '
+ -L letter frequency count. alias for -inkr1
Options that don't take arguments may be bundled: -BipW is the same as
-B -i -p -W.
@@ -139,11 +145,40 @@ sub hexify {
return join(" ", @out);
}
+# this bit of 300 baud linenoise replaces all non-ascii characters
+# with {\x00} style hex escapes. best viewed with a colorful syntax
+# highlighting editor (I like vim).
+sub asciify {
+ my $in = shift;
+
+ $in =~
+ s|
+ ( # capture to $1...
+ [^\x20-\x7e] # anything outside the range 0x20 - 0x7e
+ )
+ | # replace with {\xXX}:
+ "{\\x" .
+ sprintf("%02x", ord($1)) .
+ "}"
+ |gex;
+
+ return $in;
+}
+
+# sub render, sub render, but don't give yourself awaaaayyy
+sub render {
+ our %opt;
+ my $in = shift;
+ return hexify($in) if $opt{x};
+ return asciify($in) if $opt{a};
+ return $in;
+}
+
# main()
-our %opt;
-getopts('hcpiwWte:d:f:b:xo:Bs:T:P/:nkF', \%opt);
+getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:L', \our %opt);
-HELP_MESSAGE() if $opt{h};
+# -h == --help
+HELP_MESSAGE(), exit(0) if $opt{h};
# -F = -ink/' '
if($opt{F}) {
@@ -151,18 +186,41 @@ if($opt{F}) {
$opt{n} = $opt{k} = $opt{i} = 1;
}
-# use an eval here so we can handle escapes like \t \n \r
+# -L = -inkr1
+if($opt{L}) {
+ $opt{r} = 1;
+ $opt{n} = $opt{k} = $opt{i} = 1;
+}
+
+
+# use an eval for -/ so we can handle escapes like \t \n \r
if(defined($opt{'/'})) {
- $opt{'/'} =~ s/'/\\'/g; # to allow -/"'"
- eval "\$/ = '" . $opt{'/'} . "'";
+ if(defined($opt{r})) {
+ warn "$SELF: -r and -/ given; -/ ignored\n";
+ } else {
+ $opt{'/'} =~ s/'/\\'/g; # to allow -/"'"
+ eval "\$/ = '" . $opt{'/'} . "'";
+ }
+}
+
+# -r also uses $/
+if(defined($opt{r})) {
+ die "$SELF: -r argument must be positive integer\n"
+ unless $opt{r} =~ /^\d+$/;
+ $/ = \$opt{r};
}
+# -o implies -P
if(defined $opt{o}) {
$opt{P} = 1;
} else {
$opt{o} = "\t";
}
+# -cp implies -P too
+$opt{P}++ if $opt{c} && $opt{p};
+
+# handle -d arg. we only support the /i modifier when -d/regex/.
if(defined $opt{d}) {
if($opt{d} =~ m(^/(.+)/(i?)$)) {
# qr/$1/$2 is a syntax error, so:
@@ -178,6 +236,7 @@ if(defined $opt{d}) {
$opt{d} = qr/\s+/;
}
+# handle -b arg
our $substrarg;
if(defined $opt{b}) {
for($opt{b}) {
@@ -195,35 +254,42 @@ if(defined $opt{b}) {
# don't break using -1 for rightmost field)
$opt{f}-- if defined $opt{f} && $opt{f} > 0;
-our ($revsort, $foldsort, $alphasort, $sortcode);
-
+# handle the various -s sub-options
+our ($revsort, $foldsort, $alphasort);
if($opt{s}) {
for(split "", $opt{s}) {
- /r/ && do { $revsort++; };
- /f/ && do { $foldsort++; $alphasort++; };
- /a/ && do { $alphasort++; };
+ /r/ && do { $revsort++; };
+ /f/ && do { $foldsort++; };
+ /a/ && do { $alphasort++; };
/([^rfa])/ && do {
warn "$SELF: ignoring unknown sort option '$1'\n"; };
}
}
-# Sorry, this is kinda ugly.
+# construct a string of perl code to implement the sort, according
+# to the options given. sorry, this is kinda ugly.
+our ($a, $b, $A, $B);
+$a = $revsort ? '$b' : '$a';
+$b = $revsort ? '$a' : '$b';
+($A, $B) = $foldsort ? ("lc $a", "lc $b") : ($a, $b);
+
+our $sortcode = $A . " cmp " . $B;
if($alphasort) {
- $sortcode = "{ " .
- ($foldsort ? 'lc ' : "" ) .
- ($revsort ? '$b' : '$a') .
- " cmp " .
- ($foldsort ? 'lc ' : "" ) .
- ($revsort ? '$a' : '$b') .
- "}";
+ $sortcode = "{ $sortcode }";
} else {
- $sortcode = "{ " .
- '$counts{' . ($revsort ? '$b' : '$a') . '}' .
- " <=> " .
- '$counts{' . ($revsort ? '$a' : '$b') . '}' .
- "}";
+ $sortcode = "{ (\$counts{$a} <=> \$counts{$b}) || ($sortcode) }";
}
+# the "C" locale causes lots of warnings with 'use open ":locale"'
+# enabled, when reading files with non-ascii characters. Turning
+# on the -B option avoids that, and causes the output to be printed
+# as-is. locale(7) says LC_ALL is checked before LANG, so:
+$opt{B}++ if(($ENV{LC_ALL} || $ENV{LANG}) eq 'C');
+
+# undo the 'use open ":locale"' on STDOUT, for binary mode. This means we
+# can print binary gibberish to a terminal, but that's the user's fault.
+binmode \*STDOUT, ":raw" if $opt{B};
+
# finally done with option processing, let the main event commence.
our %counts = ();
@@ -252,7 +318,7 @@ for(@ARGV) {
};
}
- binmode $fh, ":raw:bytes" if $opt{B};
+ binmode $fh, ":raw" if $opt{B};
$readfiles++;
while(<$fh>) {
@@ -279,7 +345,7 @@ for(@ARGV) {
if($opt{e}) {
no strict;
- no warnings qw/exiting/; # so -e code can "next" to skip a line
+ no warnings qw/exiting/; # so -e code can "next" to skip a record
eval $opt{e};
die "$SELF: $@" if $@;
}
@@ -308,15 +374,15 @@ if($opt{T}) {
if(!$opt{P}) {
for(keys %counts) {
- my $l = length($opt{x} ? hexify($_) : $_);
+ my $l = length(render($_));
$longest = $l if $longest < $l;
}
}
# done reading & counting all input, show the results.
for(sort { eval $sortcode } keys %counts) {
- print ($opt{x} ? hexify($_) : $_);
- print " " x ($longest - length) unless $opt{P};
+ print (my $printable = render($_));
+ print " " x ($longest - length($printable)) unless $opt{P};
print $opt{o} . $counts{$_} unless $opt{p};
printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
print "\n";