#!/usr/bin/perl use warnings; no warnings 'surrogate'; ## doesn't work use strict; use open ":locale"; use Getopt::Std; # this makes getopts exit after --help: $Getopt::Std::STANDARD_HELP_VERSION++; (our $SELF = $0) =~ s,.*/,,; our $VERSION="0.0.1"; sub HELP_MESSAGE { print < ... Given the following input: foo foo bar bar baz $SELF will output: bar 2 40.0% baz 1 20.0% foo 2 40.0% The name 'bkt' comes from the concept of collecting like items in buckets (this is basically how a hashtable works). The original plan was to name this script 'bucketize', but who wants to type all that? Also, purely to support lazy typists, $SELF implements subsets of the functionality of cut(1) and sort(1). General options: --help -h display this help message --version display '$SELF $VERSION' -- end of options; everything after this is treated as a filename Output options: -c show counts only (suppress percentages) -p show percentages only (suppress counts) -c and -p may be combined, if you can find a use for it -t show total count -x print output in hexadecimal -s opts output sort options. opts may include: r - reverse sort (default is ascending) a - sort alphabetically (default is by count) f - sort alphabetically, folding case -T thresh filter out results below threshold (which may be a count or a percentage, e.g. 5%). -o string use string as output delimiter (default: \\t). implies -P. -P don't pad output with spaces to length of longest element -o option enables this as well. Input options: -B binary mode (default: input is characters in current locale) -/ sep set value of \$/, perl's input record separator. default is \\n. one of -w -W -n is highly recommended with this option. -b range consider only a range of chars/bytes in each record (e.g. 1-3) -d delim delimiter for -f (default: /\\s+/ aka whitespace) -f field consider only this (delimiter-separated) field -i case insensitive (actually, lowercases all input) -w remove leading and trailing whitespace from input records -W remove ALL whitespace from input records -n remove all non-word (\\W) characters from input records -e code execute perl code for each input record (should modify \$_, make sure you quote the argument as needed by your shell) -k skip blank records -F word frequency count. alias for -ink/' ' Options that don't take arguments may be bundled: -BipW is the same as -B -i -p -W. Input will be read from filenames given on the command line, or from standard input if none given, or if the filename - (hyphen) is given (use ./- to read file a real file named -). The input need not be sorted. The output will always be sorted. Each input record is chomped before any further processing. -b is like the -b or -c option to cut(1) (depending on whether -B is set). It supports the same type of range as cut(1): N N'th byte/character, counted from 1 N- from N'th byte/character to end of record N-M from N'th to M'th (included) byte/character -M from first to M'th (included) byte/character ...plus 2 extra types: -M- from Mth-to-last byte/character to end of record (-1 = last) -M-N from Mth-to-last byte/characters to Nth-to-last ...except that cut allows many ranges separated by commas, while $SELF -b only allows a single range. -d is like the the -d option to cut(1), except that the delimiter can be multiple characters. Also, the delimiter is treated as a regular expression if it's at least 3 characters long *and* enclosed in //. The /i modifier is supported, but none of the other /x regex modifiers are. -f like cut's -f, except that it only allows a single field number (not a list), which is indexed starting from 1 (same as cut)... or a negative number, meaning the Nth field from the right (-1 = rightmost). Also unlike cut, -f and -b may be combined (-b is applied first). The -b -f -i -w -W -n -e

 -k options will be processed in the
order listed here, regardless of the order they're given on the command
line. In particular, this means the code for -e will see \$_ *after*
it's been modified by any of the other options (except -k).

The code for -e will run with strict disabled and warnings enabled. To
disable warnings, prefix the code with 'no warnings;'. There can only
be one -e option, but it may be multiple lines of code separated with
semicolons (like perl's own -e option). When the -e code runs, \$_
contains the input (possibly tranformed by other options), and can
be modified arbitratily. The -e code can filter out unwanted records by
executing "next", which will cause them to be skipped entirely. Also,
if the -k option is used, the code can 'undef \\$_' or assign \\$_=""
to skip the current record.

EOF
}

sub VERSION_MESSAGE {
	print "$SELF $VERSION\n";
}

sub hexify {
	my @out = ();
	push @out, sprintf("%x", ord($_)) for split "", $_[0];
	return join(" ", @out);
}

# main()
our %opt;
getopts('hcpiwWte:d:f:b:xo:Bs:T:P/:nkF', \%opt);

HELP_MESSAGE() if $opt{h};

# -F = -ink/' '
if($opt{F}) {
	$opt{'/'} = ' ';
	$opt{n} = $opt{k} = $opt{i} = 1;
}

# use an eval here so we can handle escapes like \t \n \r
if(defined($opt{'/'})) {
	$opt{'/'} =~ s/'/\\'/g; # to allow -/"'"
	eval "\$/ = '" .  $opt{'/'} . "'";
}

if(defined $opt{o}) {
	$opt{P} = 1;
} else {
	$opt{o} = "\t";
}

if(defined $opt{d}) {
	if($opt{d} =~ m(^/(.+)/(i?)$)) {
		# qr/$1/$2 is a syntax error, so:
		$opt{d} = $2 ? qr/$1/i : qr/$1/;
	} else {
		$opt{d} = quotemeta($opt{d});
		$opt{d} = qr/$opt{d}/;
	}
	if(not defined($opt{f})) {
		warn "$SELF: -d given without -f, which is pointless\n";
	}
} else {
	$opt{d} = qr/\s+/;
}

our $substrarg;
if(defined $opt{b}) {
	for($opt{b}) {
		/^(\d+)$/ && do        { $substrarg = "$1 - 1, 1" };
		/^(\d+)-$/ && do       { $substrarg = "$1 - 1" };
		/^-(\d+)$/ && do       { $substrarg = "0, $1" };
		/^(\d+)-(\d+)$/ && do  { $substrarg = "$1 - 1, " . ($2 - $1 + 1) };
		/^-(\d+)-$/ && do      { $substrarg = "$1"; };
		/^-(\d+)-(\d+)$/ && do { $substrarg = "$1, " . ($2 - $1); };
	}
	die "$SELF: invalid -b argument\n" unless $substrarg;
}

# -f index starts at 1, perl arrays are indexed from 0, fix (but
# don't break using -1 for rightmost field)
$opt{f}-- if defined $opt{f} && $opt{f} > 0;

our ($revsort, $foldsort, $alphasort, $sortcode);

if($opt{s}) {
	for(split "", $opt{s}) {
		/r/ && do { $revsort++;                 };
		/f/ && do { $foldsort++;  $alphasort++; };
		/a/ && do { $alphasort++;               };
		/([^rfa])/ && do {
			warn "$SELF: ignoring unknown sort option '$1'\n"; };
	}
}

# Sorry, this is kinda ugly.
if($alphasort) {
	$sortcode = "{ " .
		($foldsort ? 'lc ' : "" ) .
		($revsort ? '$b' : '$a') .
		" cmp " .
		($foldsort ? 'lc ' : "" ) .
		($revsort ? '$a' : '$b') .
		"}";
} else {
	$sortcode = "{ " .
		'$counts{' . ($revsort ? '$b' : '$a') . '}' .
		" <=> " .
		'$counts{' . ($revsort ? '$a' : '$b') . '}' .
		"}";
}

# finally done with option processing, let the main event commence.

our %counts = ();
our $total = 0;
our $longest = 0;
our $readfiles = 0;
our $badfiles = 0;

# Sadly, we can't use the magical while(<>) here to automatically iterate
# and open all the files in @ARGV, because of the -B option. We need to
# call binmode() on each filehandle after it's opened, but before anything
# gets read from it. 'use open ":bytes"' would set the default binmode,
# but I couldn't get it to work conditionally (not even with eval).

$ARGV[0] = '-' unless @ARGV;
for(@ARGV) {
	my $fh;

	if($_ eq '-') {
		$fh = \*STDIN;
	} else {
		open $fh, '<', $_ or do {
			warn "$SELF: $_: $!\n";
			$badfiles++;
			next;
		};
	}

	binmode $fh, ":bytes" if $opt{B};
	$readfiles++;

	while(<$fh>) {
		chomp;

		# behave like cut for -b/-f: no warnings if -f3 but only 2 fields exist,
		# or -b10 but only 9 characters exist.

		if($substrarg) { # set via $opt{b}
			no warnings qw/substr/;
			eval "\$_ = substr(\$_, $substrarg)";
			$_ = "" unless defined $_;
		}

		if(defined $opt{f}) {
			$_ = (split(/$opt{d}/))[$opt{f}];
			$_ = "" unless defined $_;
		}

		$_ = lc if $opt{i};
		s/^\s+|\s+$//g if $opt{w};
		s/\s//g if $opt{W};
		s/\W//g if $opt{n};

		if($opt{e}) {
			no strict;
			no warnings qw/exiting/; # so -e code can "next" to skip a line
			eval $opt{e};
			die "$SELF: $@" if $@;
		}
		next if $opt{k} && (!defined || length == 0);
		$_ = "" unless defined $_;

		$counts{$_}++;
		$total++;
	}
}

die "$SELF: couldn't read any input files\n" unless $readfiles;

if($opt{T}) {
	(my ($thresh, $pct)) = ($opt{T} =~ /^(\d+)(%?)/);
	if($thresh) {
		for(keys %counts) {
			delete $counts{$_} if
				($pct && (($counts{$_} * 100 / $total) < $thresh)) ||
				(!$pct && ($counts{$_} < $thresh));
		}
	} else {
		die "$SELF: invalid argument for -T\n";
	}
}

if(!$opt{P}) {
	for(keys %counts) {
		my $l = length($opt{x} ? hexify($_) : $_);
		$longest = $l if $longest < $l;
	}
}

# done reading & counting all input, show the results.
for(sort { eval $sortcode } keys %counts) {
	print ($opt{x} ? hexify($_) : $_);
	print " " x ($longest - length) unless $opt{P};
	print $opt{o} . $counts{$_} unless $opt{p};
	printf "$opt{o}%.1f%%", ($counts{$_} * 100 / $total) unless $opt{c};
	print "\n";
}

if($opt{t}) {
	print "\n-- Total count: $total\n";
}

# be like cat, exit with error status if any input file couldn't be
# read (even if we did successfully read others)
exit($badfiles != 0);

__END__

Examples:

# show the percentage of binaries that start with each letter/number/etc,
# 4 different ways
cd /usr/bin
ls | bkt -b1
ls | cut -b1 | bkt
ls | bkt -e '$_=substr($_,0,1)'
ls | bkt -e 's,^(.).*,$1,'

# show percentages of stuff said by each user in an irssi IRC log. relies
# on the log format having a timestamp, space,  for normal lines.
# misses /me actions entirely though.
# add -sr to show the most talkative first.
bkt -f2 -e 'next unless /^\