diff options
-rwxr-xr-x | bkt | 187 |
1 files changed, 121 insertions, 66 deletions
@@ -1,17 +1,24 @@ #!/usr/bin/perl use warnings; -use Getopt::Std; +no warnings 'surrogate'; ## doesn't work +use strict; + use open ":locale"; + +use Getopt::Std; +# this makes getopts exit after --help: $Getopt::Std::STANDARD_HELP_VERSION++; -($SELF = $0) =~ s,.*/,,; -$VERSION="0.0.1"; +(our $SELF = $0) =~ s,.*/,,; +our $VERSION="0.0.1"; sub HELP_MESSAGE { print <<EOF; $SELF - count repeats in input +Usage: $SELF <options> <file> ... + Given the following input: foo @@ -26,75 +33,101 @@ bar 2 40.0% baz 1 20.0% foo 2 40.0% -Usage: $SELF <options> <file> ... +The name 'bkt' comes from the concept of collecting like items in buckets +(this is basically how a hashtable works). The original plan was to name +this script 'bucketize', but who wants to type all that? Also, purely +to support lazy typists, $SELF implements subsets of the functionality +of cut(1) and sort(1). General options: --help -h display this help message + --version display '$SELF $VERSION' + -- end of options; everything after this is treated as a filename Output options: -c show counts only (suppress percentages) -p show percentages only (suppress counts) + -c and -p may be combined, if you can find a use for it -t show total count -x print output in hexadecimal -s opts output sort options. opts may include: - r - reverse sort - f - fold case - c - sort by count rather than input string + r - reverse sort (default is ascending) + a - sort alphabetically (default is by count) + f - sort alphabetically, folding case -T thresh filter out results below threshold (which may be a count or a percentage, e.g. 5%). + -o string use string as output delimiter (default: \\t). implies -P. -P don't pad output with spaces to length of longest element -o option enables this as well. Input options: -B binary mode (default: input is characters in current locale) - -o string use string as output delimiter (default: \\t). implies -P. - -b range consider only a range of chars/bytes in each line (e.g. 1-3) - -d delim split on delimiter (default: /\\s+/ aka whitespace) + -/ sep set value of \$/, perl's input record separator. default is \\n. + one of -w -W -n is highly recommended with this option. + -b range consider only a range of chars/bytes in each record (e.g. 1-3) + -d delim delimiter for -f (default: /\\s+/ aka whitespace) -f field consider only this (delimiter-separated) field -i case insensitive (actually, lowercases all input) - -w remove leading and trailing whitespace from input lines - -W remove ALL whitespace from input lines - -e code execute perl code for each line of input (should modify \$_, + -w remove leading and trailing whitespace from input records + -W remove ALL whitespace from input records + -n remove all non-word (\\W) characters from input records + -e code execute perl code for each input record (should modify \$_, make sure you quote the argument as needed by your shell) + -k skip blank records + -F word frequency count. alias for -ink/' ' + +Options that don't take arguments may be bundled: -BipW is the same as +-B -i -p -W. Input will be read from filenames given on the command line, or from -standard input if none given. The input need not be sorted. The output -will always be sorted. +standard input if none given, or if the filename - (hyphen) is given (use +./- to read file a real file named -). The input need not be sorted. The +output will always be sorted. -Each line of input is chomped (has trailing \\n removed) before any -further processing. +Each input record is chomped before any further processing. -b is like the -b or -c option to cut(1) (depending on whether -B is set). It supports the same type of range as cut(1): N N'th byte/character, counted from 1 -N- from N'th byte/character to end of line +N- from N'th byte/character to end of record N-M from N'th to M'th (included) byte/character -M from first to M'th (included) byte/character +...plus 2 extra types: + +-M- from Mth-to-last byte/character to end of record (-1 = last) +-M-N from Mth-to-last byte/characters to Nth-to-last + ...except that cut allows many ranges separated by commas, while $SELF -b only allows a single range. --d is like the the -d option to cut(1), except that the delimiter can be -multiple characters. Also, the delimiter is treated as a regular expression -if it's at least 4 characters long *and* enclosed in //. +-d is like the the -d option to cut(1), except that the delimiter can +be multiple characters. Also, the delimiter is treated as a regular +expression if it's at least 3 characters long *and* enclosed in //. The +/i modifier is supported, but none of the other /x regex modifiers are. -f like cut's -f, except that it only allows a single field number (not a list), which is indexed starting from 1 (same as cut)... or a negative number, meaning the Nth field from the right (-1 = rightmost). Also unlike cut, -f and -b may be combined (-b is applied first). -The -b -f -i -w -W -e<code> options will be processed in the order -listed here, regardless of the order they're given on the command +The -b -f -i -w -W -n -e<code> -k options will be processed in the +order listed here, regardless of the order they're given on the command line. In particular, this means the code for -e will see \$_ *after* -it's been modified by any of the other options. +it's been modified by any of the other options (except -k). + +The code for -e will run with strict disabled and warnings enabled. To +disable warnings, prefix the code with 'no warnings;'. There can only +be one -e option, but it may be multiple lines of code separated with +semicolons (like perl's own -e option). When the -e code runs, \$_ +contains the input (possibly tranformed by other options), and can +be modified arbitratily. The -e code can filter out unwanted records by +executing "next", which will cause them to be skipped entirely. Also, +if the -k option is used, the code can 'undef \\$_' or assign \\$_="" +to skip the current record. -There can only be one -e option, but it may be multiple lines of code -separated with semicolons (like perl's own -e option). When the -e code -runs, \$_ contains the input (possibly tranformed by other options), -and can be modified arbitratily. The -e code can filter out unwanted -lines by executing "next", which will cause them to be skipped entirely. EOF } @@ -109,12 +142,22 @@ sub hexify { } # main() -getopts('hcpiwWte:d:f:b:xo:Bs:T:P', \%opt); +our %opt; +getopts('hcpiwWte:d:f:b:xo:Bs:T:P/:nkF', \%opt); HELP_MESSAGE() if $opt{h}; -die "$SELF: options -c and -p are mutually exclusive\n" - if $opt{c} && $opt{p}; +# -F = -ink/' ' +if($opt{F}) { + $opt{'/'} = ' '; + $opt{n} = $opt{k} = $opt{i} = 1; +} + +# use an eval here so we can handle escapes like \t \n \r +if(defined($opt{'/'})) { + $opt{'/'} =~ s/'/\\'/g; # to allow -/"'" + eval "\$/ = '" . $opt{'/'} . "'"; +} if(defined $opt{o}) { $opt{P} = 1; @@ -123,22 +166,29 @@ if(defined $opt{o}) { } if(defined $opt{d}) { - if($opt{d} =~ m|^/(.{2,})/$|) { - $opt{d} = qr/$1/; + if($opt{d} =~ m(^/(.+)/(i?)$)) { + # qr/$1/$2 is a syntax error, so: + $opt{d} = $2 ? qr/$1/i : qr/$1/; } else { $opt{d} = quotemeta($opt{d}); $opt{d} = qr/$opt{d}/; } + if(not defined($opt{f})) { + warn "$SELF: -d given without -f, which is pointless\n"; + } } else { $opt{d} = qr/\s+/; } +our $substrarg; if(defined $opt{b}) { for($opt{b}) { - /^(\d+)$/ && do { $substrarg = "$1 - 1, 1" }; - /^(\d+)-$/ && do { $substrarg = "$1 - 1" }; - /^-(\d+)$/ && do { $substrarg = "0, $1" }; - /^(\d+)-(\d+)$/ && do { $substrarg = "$1 - 1, " . ($2 - $1 + 1) }; + /^(\d+)$/ && do { $substrarg = "$1 - 1, 1" }; + /^(\d+)-$/ && do { $substrarg = "$1 - 1" }; + /^-(\d+)$/ && do { $substrarg = "0, $1" }; + /^(\d+)-(\d+)$/ && do { $substrarg = "$1 - 1, " . ($2 - $1 + 1) }; + /^-(\d+)-$/ && do { $substrarg = "$1"; }; + /^-(\d+)-(\d+)$/ && do { $substrarg = "$1, " . ($2 - $1); }; } die "$SELF: invalid -b argument\n" unless $substrarg; } @@ -147,27 +197,20 @@ if(defined $opt{b}) { # don't break using -1 for rightmost field) $opt{f}-- if defined $opt{f} && $opt{f} > 0; +our ($revsort, $foldsort, $alphasort, $sortcode); + if($opt{s}) { for(split "", $opt{s}) { - /r/ && do { $revsort++; }; - /f/ && do { $foldsort++; }; - /c/ && do { $countsort++; }; - /([^rfc])/ && do { + /r/ && do { $revsort++; }; + /f/ && do { $foldsort++; $alphasort++; }; + /a/ && do { $alphasort++; }; + /([^rfa])/ && do { warn "$SELF: ignoring unknown sort option '$1'\n"; }; } } -if($countsort && $foldsort) { - die "$SELF: sort options 'c' and 'f' are mutually exclusive\n"; -} - -if($countsort) { - $sortcode = "{ " . - '$counts{' . ($revsort ? '$b' : '$a') . '}' . - " <=> " . - '$counts{' . ($revsort ? '$a' : '$b') . '}' . - "}"; -} else { +# Sorry, this is kinda ugly. +if($alphasort) { $sortcode = "{ " . ($foldsort ? 'lc ' : "" ) . ($revsort ? '$b' : '$a') . @@ -175,18 +218,24 @@ if($countsort) { ($foldsort ? 'lc ' : "" ) . ($revsort ? '$a' : '$b') . "}"; +} else { + $sortcode = "{ " . + '$counts{' . ($revsort ? '$b' : '$a') . '}' . + " <=> " . + '$counts{' . ($revsort ? '$a' : '$b') . '}' . + "}"; } # finally done with option processing, let the main event commence. -%counts = (); -$total = 0; -$longest = 0; -$readfiles = 0; -$badfiles = 0; +our %counts = (); +our $total = 0; +our $longest = 0; +our $readfiles = 0; +our $badfiles = 0; # Sadly, we can't use the magical while(<>) here to automatically iterate -# and open all the files in @ARGV, because of the locale stuff. We need to +# and open all the files in @ARGV, because of the -B option. We need to # call binmode() on each filehandle after it's opened, but before anything # gets read from it. 'use open ":bytes"' would set the default binmode, # but I couldn't get it to work conditionally (not even with eval). @@ -220,20 +269,24 @@ for(@ARGV) { $_ = "" unless defined $_; } - if($opt{f}) { - $_ = (split($opt{d}))[$opt{f}]; + if(defined $opt{f}) { + $_ = (split(/$opt{d}/))[$opt{f}]; $_ = "" unless defined $_; } $_ = lc if $opt{i}; s/^\s+|\s+$//g if $opt{w}; s/\s//g if $opt{W}; + s/\W//g if $opt{n}; if($opt{e}) { + no strict; no warnings qw/exiting/; # so -e code can "next" to skip a line eval $opt{e}; die "$SELF: $@" if $@; } + next if $opt{k} && (!defined || length == 0); + $_ = "" unless defined $_; $counts{$_}++; $total++; @@ -251,7 +304,7 @@ if($opt{T}) { (!$pct && ($counts{$_} < $thresh)); } } else { - warn "$SELF: invalid argument for -T\n"; + die "$SELF: invalid argument for -T\n"; } } @@ -266,9 +319,8 @@ if(!$opt{P}) { for(sort { eval $sortcode } keys %counts) { print ($opt{x} ? hexify($_) : $_); print " " x ($longest - length) unless $opt{P}; - print $opt{o}; - print $counts{$_} . $opt{o} unless $opt{p}; - printf "%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c}; + print $opt{o} . $counts{$_} unless $opt{p}; + printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c}; print "\n"; } @@ -295,7 +347,7 @@ ls | bkt -e 's,^(.).*,$1,' # show percentages of stuff said by each user in an irssi IRC log. relies # on the log format having a timestamp, space, <nick> for normal lines. # misses /me actions entirely though. -# add -src to show the most talkative first. +# add -sr to show the most talkative first. bkt -f2 -e 'next unless /^\</' channelname.log # show us how many users use each shell (including stuff like /bin/false). @@ -304,3 +356,6 @@ bkt -d: -f-1 /etc/passwd # how many images of each type have we got? ignore case, so JPG and jpg # are counted together. ls ~/images/*.* | bkt -d. -f-1 + +# What percentage of words in a text file are capitalized? +bkt -n/' ' -e's/^[A-Z]+$/CAPS/ || s/^[A-Z].*$/Caps/ || s/^[a-z].*$/lower/ || next' file.txt |