diff options
author | B. Watson <yalhcru@gmail.com> | 2015-10-12 22:46:42 -0400 |
---|---|---|
committer | B. Watson <yalhcru@gmail.com> | 2015-10-12 22:46:42 -0400 |
commit | c76bc179d5cdecc6a73a88f8ee47ff474e098e79 (patch) | |
tree | 0126213ab401fd10fc7bb5b617f4f02e32c7542c /bkt | |
parent | 440229f91b4eee243b7de550fa70e8a3b3f64600 (diff) | |
download | misc-scripts-c76bc179d5cdecc6a73a88f8ee47ff474e098e79.tar.gz |
enable -B when LANG=C, add -a option
Diffstat (limited to 'bkt')
-rwxr-xr-x | bkt | 124 |
1 files changed, 95 insertions, 29 deletions
@@ -1,8 +1,11 @@ #!/usr/bin/perl +# by popular demand: use warnings; use strict; +# I wish there were a way to do this conditionally. +# no, this didn't work: require 'open.pm'; ::open->import(':locale', ':std'); use open ":locale", ":std"; use Getopt::Std; @@ -49,6 +52,7 @@ Output options: -c and -p may be combined, if you can find a use for it -t show total count -x print output in hexadecimal + -a ASCII output: render non-ASCII characters as hex escapes -s opts output sort options. opts may include: r - reverse sort (default is ascending) a - sort alphabetically (default is by count) @@ -61,6 +65,7 @@ Output options: Input options: -B binary mode (default: input is characters in current locale) + -r int read input as fixed-size records (can't combine with -/) -/ sep set value of \$/, perl's input record separator. default is \\n. one of -w -W -n is highly recommended with this option. -b range consider only a range of chars/bytes in each record (e.g. 1-3) @@ -74,6 +79,7 @@ Input options: make sure you quote the argument as needed by your shell) -k skip blank records -F word frequency count. alias for -ink/' ' + -L letter frequency count. alias for -inkr1 Options that don't take arguments may be bundled: -BipW is the same as -B -i -p -W. @@ -139,11 +145,40 @@ sub hexify { return join(" ", @out); } +# this bit of 300 baud linenoise replaces all non-ascii characters +# with {\x00} style hex escapes. best viewed with a colorful syntax +# highlighting editor (I like vim). +sub asciify { + my $in = shift; + + $in =~ + s| + ( # capture to $1... + [^\x20-\x7e] # anything outside the range 0x20 - 0x7e + ) + | # replace with {\xXX}: + "{\\x" . + sprintf("%02x", ord($1)) . + "}" + |gex; + + return $in; +} + +# sub render, sub render, but don't give yourself awaaaayyy +sub render { + our %opt; + my $in = shift; + return hexify($in) if $opt{x}; + return asciify($in) if $opt{a}; + return $in; +} + # main() -our %opt; -getopts('hcpiwWte:d:f:b:xo:Bs:T:P/:nkF', \%opt); +getopts('hcpiwWte:d:f:b:xao:Bs:T:P/:nkFr:L', \our %opt); -HELP_MESSAGE() if $opt{h}; +# -h == --help +HELP_MESSAGE(), exit(0) if $opt{h}; # -F = -ink/' ' if($opt{F}) { @@ -151,18 +186,41 @@ if($opt{F}) { $opt{n} = $opt{k} = $opt{i} = 1; } -# use an eval here so we can handle escapes like \t \n \r +# -L = -inkr1 +if($opt{L}) { + $opt{r} = 1; + $opt{n} = $opt{k} = $opt{i} = 1; +} + + +# use an eval for -/ so we can handle escapes like \t \n \r if(defined($opt{'/'})) { - $opt{'/'} =~ s/'/\\'/g; # to allow -/"'" - eval "\$/ = '" . $opt{'/'} . "'"; + if(defined($opt{r})) { + warn "$SELF: -r and -/ given; -/ ignored\n"; + } else { + $opt{'/'} =~ s/'/\\'/g; # to allow -/"'" + eval "\$/ = '" . $opt{'/'} . "'"; + } +} + +# -r also uses $/ +if(defined($opt{r})) { + die "$SELF: -r argument must be positive integer\n" + unless $opt{r} =~ /^\d+$/; + $/ = \$opt{r}; } +# -o implies -P if(defined $opt{o}) { $opt{P} = 1; } else { $opt{o} = "\t"; } +# -cp implies -P too +$opt{P}++ if $opt{c} && $opt{p}; + +# handle -d arg. we only support the /i modifier when -d/regex/. if(defined $opt{d}) { if($opt{d} =~ m(^/(.+)/(i?)$)) { # qr/$1/$2 is a syntax error, so: @@ -178,6 +236,7 @@ if(defined $opt{d}) { $opt{d} = qr/\s+/; } +# handle -b arg our $substrarg; if(defined $opt{b}) { for($opt{b}) { @@ -195,35 +254,42 @@ if(defined $opt{b}) { # don't break using -1 for rightmost field) $opt{f}-- if defined $opt{f} && $opt{f} > 0; -our ($revsort, $foldsort, $alphasort, $sortcode); - +# handle the various -s sub-options +our ($revsort, $foldsort, $alphasort); if($opt{s}) { for(split "", $opt{s}) { - /r/ && do { $revsort++; }; - /f/ && do { $foldsort++; $alphasort++; }; - /a/ && do { $alphasort++; }; + /r/ && do { $revsort++; }; + /f/ && do { $foldsort++; }; + /a/ && do { $alphasort++; }; /([^rfa])/ && do { warn "$SELF: ignoring unknown sort option '$1'\n"; }; } } -# Sorry, this is kinda ugly. +# construct a string of perl code to implement the sort, according +# to the options given. sorry, this is kinda ugly. +our ($a, $b, $A, $B); +$a = $revsort ? '$b' : '$a'; +$b = $revsort ? '$a' : '$b'; +($A, $B) = $foldsort ? ("lc $a", "lc $b") : ($a, $b); + +our $sortcode = $A . " cmp " . $B; if($alphasort) { - $sortcode = "{ " . - ($foldsort ? 'lc ' : "" ) . - ($revsort ? '$b' : '$a') . - " cmp " . - ($foldsort ? 'lc ' : "" ) . - ($revsort ? '$a' : '$b') . - "}"; + $sortcode = "{ $sortcode }"; } else { - $sortcode = "{ " . - '$counts{' . ($revsort ? '$b' : '$a') . '}' . - " <=> " . - '$counts{' . ($revsort ? '$a' : '$b') . '}' . - "}"; + $sortcode = "{ (\$counts{$a} <=> \$counts{$b}) || ($sortcode) }"; } +# the "C" locale causes lots of warnings with 'use open ":locale"' +# enabled, when reading files with non-ascii characters. Turning +# on the -B option avoids that, and causes the output to be printed +# as-is. locale(7) says LC_ALL is checked before LANG, so: +$opt{B}++ if(($ENV{LC_ALL} || $ENV{LANG}) eq 'C'); + +# undo the 'use open ":locale"' on STDOUT, for binary mode. This means we +# can print binary gibberish to a terminal, but that's the user's fault. +binmode \*STDOUT, ":raw" if $opt{B}; + # finally done with option processing, let the main event commence. our %counts = (); @@ -252,7 +318,7 @@ for(@ARGV) { }; } - binmode $fh, ":raw:bytes" if $opt{B}; + binmode $fh, ":raw" if $opt{B}; $readfiles++; while(<$fh>) { @@ -279,7 +345,7 @@ for(@ARGV) { if($opt{e}) { no strict; - no warnings qw/exiting/; # so -e code can "next" to skip a line + no warnings qw/exiting/; # so -e code can "next" to skip a record eval $opt{e}; die "$SELF: $@" if $@; } @@ -308,15 +374,15 @@ if($opt{T}) { if(!$opt{P}) { for(keys %counts) { - my $l = length($opt{x} ? hexify($_) : $_); + my $l = length(render($_)); $longest = $l if $longest < $l; } } # done reading & counting all input, show the results. for(sort { eval $sortcode } keys %counts) { - print ($opt{x} ? hexify($_) : $_); - print " " x ($longest - length) unless $opt{P}; + print (my $printable = render($_)); + print " " x ($longest - length($printable)) unless $opt{P}; print $opt{o} . $counts{$_} unless $opt{p}; printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c}; print "\n"; |