diff options
Diffstat (limited to 'bsgrep')
| -rwxr-xr-x | bsgrep | 454 |
1 files changed, 454 insertions, 0 deletions
#!/usr/bin/perl -w

# bsgrep / bsjoin: grep-like search over "logical" lines, where physical
# lines ending in a continuation string (backslash by default) are joined
# together before matching. This first part of the file holds the shared
# state, the warning handler, and the helper subs; the main program and
# the POD documentation follow.

use strict;
use warnings;

use Getopt::Std;
use File::Find;

our $VERSION = "0.0.1";

# State shared between the subs below and the main program:
#   $self        basename of $0 (bsgrep or bsjoin), used as message prefix
#   $ret         exit status (0 = matched, 1 = no match, 2 = error)
#   $out         logical line currently being assembled
#   $start_line  line number ($.) of the first physical line of $out
#   $filecount   number of input files; more than one enables name prefixes
#   $cont        quotemeta'd continuation string
#   $cr_warning  true once the CR warning was issued for the current file
#   %opt         parsed command-line flags
#   %printed     files already listed by -l
#   @patterns    regexes to search for
#   @nargv       scratch list used by the main program
our ($self, $ret, $out, $start_line, $filecount, $cont, $cr_warning);
our (%opt, %printed, @patterns, @nargv);

($self = $0) =~ s{.*/}{};

%printed = ();

# Route every warning through one place so messages look like grep's.
$SIG{__WARN__} = sub {
    my ($msg) = @_;

    # don't include the line number in warnings.
    $msg =~ s/ at \S+ line \d+\.$//;

    # File::Find seems to use double newlines for its warnings..
    $msg =~ s/\n\n+/\n/;

    # warnings that don't start with "$self:" are e.g. file access errors
    # from the 'while(<>)' or File::Find; those count as errors for the
    # exit status. \Q...\E keeps regex metacharacters in the program
    # name from being interpreted.
    if ($msg !~ /^\Q$self\E:/) {
        $msg = "$self: $msg";
        $ret = 2 unless $opt{q};
    }

    print STDERR $msg unless $opt{s};
};

# Parse the command line for bsgrep.
#
# Collects every "-e PATTERN" / "-ePATTERN" into @patterns and removes
# them from @ARGV first, because Getopt::Std doesn't support multiple
# occurrences of a flag with different args (Getopt::Long would be the
# better tool). The remaining flags are then parsed into %opt.
#
# Only arguments that *start* with -e are treated as patterns, and
# nothing after a literal "--" is touched, so file names or patterns that
# merely contain "-e" are left alone. -e bundled behind other flags
# (e.g. "-ie") is not recognized.
#
# Exits with status 2 on a usage error.
sub grep_options {
    my @remaining;
    my $want_pattern = 0;    # previous argument was a bare -e
    my $options_done = 0;    # a "--" terminator has been seen

    for my $arg (@ARGV) {
        if ($want_pattern) {
            push @patterns, $arg;
            $want_pattern = 0;
        }
        elsif ($options_done) {
            push @remaining, $arg;
        }
        elsif ($arg eq '--') {
            # keep the terminator so getopts() stops there as well.
            $options_done = 1;
            push @remaining, $arg;
        }
        elsif ($arg eq '-e') {
            $want_pattern = 1;
        }
        elsif ($arg =~ /^-e(.+)\z/s) {
            push @patterns, $1;
        }
        else {
            push @remaining, $arg;
        }
    }

    if ($want_pattern) {
        warn "$self: option requires an argument -- e\n";
        exit 2;
    }

    @ARGV = @remaining;

    # exit status 1 means "no match", so usage errors use 2.
    getopts('d:FiklnNqrsvwz', \%opt) || exit 2;
    return;
}

# Print one logical line, given as the only argument.
# The line is prefixed with the current file name when more than one
# file is being read, and with the starting line number under -n. It is
# terminated with NUL under -z, newline otherwise.
sub print_line {
    my ($text) = @_;

    my $prefix = '';
    $prefix .= "$ARGV:"       if $filecount > 1;
    $prefix .= "$start_line:" if $opt{n};

    print $prefix, $text, ($opt{z} ? "\0" : "\n");
    return;
}

# Parse the command line for bsjoin (the subset of flags that make sense
# without a search pattern). Exits with status 2 on a usage error.
sub join_options {
    getopts('d:knwz', \%opt) || exit 2;
    return;
}

# Test the assembled logical line in $out against @patterns and print it
# (or, with -l, its file name) if it is selected.
#
# A line "matches" when any pattern matches, or when all patterns match
# under -N. -v then inverts that overall result, so with several -e
# patterns -v selects lines matching none of them, as grep does.
# Sets $ret to 0 on the first selected line; prints nothing under -q.
sub handle_line {
    # (?:...) avoids Perl's special case where an empty pattern means
    # "reuse the last successful pattern".
    my $hits = grep { $out =~ /(?:$_)/ } @patterns;

    my $selected = $opt{N} ? ($hits == @patterns) : ($hits > 0);
    $selected = !$selected if $opt{v};
    return unless $selected;

    $ret = 0 if $ret == 1;
    return if $opt{q};

    if ($opt{l}) {
        # list each file only once, however many lines match.
        print "$ARGV\n" unless $printed{$ARGV}++;
    }
    else {
        print_line($out);
    }
    return;
}

### main()
# TODO: do we need 'use locale'?
# also, why does reading iso-8859-1 text auto-convert to utf-8?
# Print a one-line synopsis for bsgrep on standard error.
sub grep_usage {
    print STDERR "usage: $self [-FiklnNqrsvwz] [-d char] "
        . "[-e pattern ... | pattern] [file ...]\n";
    return;
}

# In a UTF-8 locale, put UTF-8 layers on standard input and output.
for my $var (qw/LANG LC_CTYPE LC_ALL/) {
    if (($ENV{$var} // "") =~ /utf-?8/i) {
        binmode(\*STDIN,  ':utf8');
        binmode(\*STDOUT, ':utf8');
        last;
    }
}

# Long options are only recognized as the first argument. exec is called
# in list form so $0 is never interpreted by a shell.
if (defined($ARGV[0])) {
    if ($ARGV[0] =~ /^--?help\z/) {
        exec 'perldoc', $0;
        exit(1);
    } elsif ($ARGV[0] eq '--man') {
        exec 'pod2man', '--stderr', '-s1', '-cUrchlaysStuff',
            "-r$VERSION", '-u', $0;
        exit(1);
    } elsif ($ARGV[0] eq '--version') {
        print "bsgrep $VERSION\n";
        exit(0);
    }
}

if ($self =~ /join/) {
    join_options();
    push @patterns, '^';    # every string has a beginning...
} else {
    grep_options();

    # without -e, the first remaining argument is the pattern. Test for
    # presence rather than truth, so that '0' and '' are valid patterns.
    if (!@patterns) {
        if (!@ARGV) {
            grep_usage();
            print STDERR "$self: missing required pattern argument\n";
            exit 2;
        }
        push @patterns, shift @ARGV;
    }

    if ($opt{F}) {
        $_ = quotemeta($_) for @patterns;
    }
    if ($opt{i}) {
        $_ = "(?i)$_" for @patterns;
    }
}

# Reject invalid regexes up front, instead of dying with a raw Perl
# error in the middle of the input.
for my $pat (@patterns) {
    my $ok = eval { my $re = qr/$pat/; 1 };
    next if $ok;
    (my $why = $@) =~ s/ at \S+ line \d+\.?\s*\z//;
    print STDERR "$self: invalid pattern: $why\n";
    exit 2;
}

# return value from main(), set to 0 if anything matched. Initialized
# before the directory scan so errors reported during the scan (which set
# it to 2 via the warning handler) are not overwritten.
$ret = 1;

if ($opt{r}) {
    @ARGV = (".") unless @ARGV;

    my @files;
    for my $path (@ARGV) {
        if (-d $path) {
            find({
                    # -l does its own lstat, so the "_" test that follows
                    # is valid; symlinks found while recursing are skipped.
                    wanted   => sub { push @files, $_ if !-l $_ && -f _; },
                    follow   => 0,
                    no_chdir => 1 },
                $path);
        } else {
            push @files, $path;
        }
    }

    # nothing to read: don't fall through to reading standard input.
    exit $ret unless @files;

    @ARGV = @files;
}

$filecount = @ARGV;    # used to decide whether to print filename prefixes.

$cont = quotemeta($opt{d} // '\\');

$/ = "\0" if $opt{z};

while (<>) {
    chomp;

    # strip the CR of a CRLF line ending; warn once per file.
    if (s/\r\z//) {
        if (!$cr_warning) {
            warn "$self: $ARGV: stripping carriage returns\n" unless $opt{s};
            $cr_warning = 1;
        }
    }

    if (/$cont\s+$/) {
        warn "$self: $ARGV:$.: whitespace after continuation, malformed input?\n" unless $opt{s};
    }

    # $out is defined exactly while a continued line is being assembled.
    # Test definedness, not truth: the text so far may be "" or "0".
    s/^\s+// if defined $out && $opt{w};
    $start_line = $. unless defined $out;
    $out .= $_;

    if (/$cont$/) {
        $out =~ s/$cont$// unless $opt{k};
    } else {
        handle_line();
        undef $out;
    }
} continue {
    # reset $. on each new file (perldoc -f eof)
    if (eof) {
        if (defined $out) {
            warn "$self: $ARGV:$.: last line ends with continuation\n" unless $opt{s};
            handle_line();
            undef $out;
        }
        close ARGV;
        $cr_warning = 0;
    }
}

exit $ret;

### rest of file is the docs

=pod

=head1 NAME

bsgrep - search for strings in files with backslash continuation

bsjoin - join lines with backslash continuation

=head1 SYNOPSIS

bsgrep [B<[-FiklnNqrsvwz]> B<-d> I<char> I<...>] [B<-e> I<pattern> ... | I<pattern>] [I<file> I<...>]

bsjoin [B<[-knwz]> B<-d> I<char> I<...>] [I<file> I<...>]

=head1 DESCRIPTION

B<bsgrep> (backslash grep) uses a regular expression to search for
strings in a file, much like B<grep>(1). The main difference is,
B<bsgrep> joins together lines that use the backslash for continuation
(e.g. as B<sh>(1) does).

Other differences: B<bsgrep> doesn't support the full set of B<grep>
options, and it uses Perl regular expressions rather than POSIX.

Input is read from one or more files, or standard input if no files
are given. Output goes to standard output.

The search is done after lines are joined together, so the regex can
match text split across continuation lines.

If B<bsgrep> is run as B<bsjoin> (via symbolic or hard link, or just
copying the executable), it will simply join together continued lines
without searching for anything. In this mode, only the B<-d>, B<-k>,
B<-n>, B<-w>, B<-z>, B<--version>, B<--help>, and B<--man> options are
supported.

=head1 OPTIONS

These options work with both B<bsgrep> and B<bsjoin>:

=over 4

=item -d I<char>

Use I<char> as the continuation character, rather than a backslash.
Actually, there's no law that says it has to be a single character,
if you can think of a use for a string here... though it's treated as
a fixed string, not a regular expression. This option does not exist
in B<grep>.

=item -k

Keep the continuation characters when joining continued lines together.
+This option does not exist in B<grep>. + +=item -n + +Prefix output lines with line numbers (same as B<grep>). For lines +that are split with continuation characters, the line number will be +that of the first line in the set. Same as B<grep>. + +=item -w + +For continuation lines, remove any leading whitespace. This option is +specific to B<bsgrep>. The B<grep> B<-w> option can be simulated with +the Perl B<\b> syntax in the regex. + +=item -z + +Use zero bytes (ASCII NUL) rather than newlines for line terminators, +for both input and output. Same as B<grep>. + +=item --version + +Print the version of B<bsgrep> and exit. + +=item --help + +Prints this help text, via B<perldoc>(1). + +=item --man + +Prints this help text as a man page, via B<pod2man>(1). Suggested use: + + bsgrep --man > bsgrep.1 + +=back + +These options are only supported by B<bsgrep>: + +=over 4 + +=item -e I<pattern> + +Use I<pattern> as the pattern. May be used multiple times, in which case +they are ORed together (a line that matches any I<pattern> is a match)... unless +the B<-N> option is used, q.v. Same as B<grep>. + +=item -F + +Treat pattern(s) as fixed strings, not regular expression(s). Same as B<grep>. + +=item -i + +Case-insensitive search (same as B<grep>). + +=item -l + +Instead of printing lines that match, print only the names of files +that contain matches (same as B<grep>). + +=item -N + +When multiple patterns are given with multiple B<-e> options, only +select lines that match all of the patterns; the default is to select +lines that match any of the patterns. This option doesn't exist +in B<grep>. + +=item -q + +Quiet: don't write to standard output. Exit status will be zero if +a match was found, even if there were errors. This doesn't prevent +warnings/errors being printed to standard error; use B<-s> to silence +those. Same as B<grep>. + +=item -r + +Recursively read all files under each directory, following symlinks +only if they're on the command line. 
If no files or directories are +given, reads the current directory. Same as B<grep>. + +=item -s + +Silence warnings (same as B<grep>). This includes error messages +about unreadable files as well as warnings about the input (see +B<DIAGNOSTICS>, below). + +=item -v + +Print only lines that do I<not> match (same as B<grep>). + +=back + +=head1 EXAMPLE + +Given the file B<trs80-roms.info> (which comes from SlackBuilds.org), containing: + + PRGNAM="trs80-roms" + VERSION="20230516" + HOMEPAGE="https://sdltrs.sourceforge.net/docs/index.html" + DOWNLOAD="https://www.filfre.net/misc/trs_roms.zip \ + http://cpmarchives.classiccmp.org/trs80/mirrors/www.discover-net.net/~dmkeil/trs80/files/trs80-62.zip \ + https://www.tim-mann.org/trs80/ld4-631.zip \ + https://archive.org/download/mame-0.250-roms-split_202212/MAME%200.250%20ROMs%20%28split%29/trs80m4p.zip \ + http://www.tim-mann.org/trs80/xtrs-4.9d.tar.gz \ + https://www.classic-computers.org.nz/system-80/disks/NEWDOS_80sssd_jv1.DSK" + MD5SUM="ecd2c47c0624885fbcfb17889241f0ed \ + 9b342f4401801bbc947e303cbeb9902f \ + f2678aa45b76d935a34a0cd2b108925d \ + 8a0f1567df8f166f4056a6a71ef7dce5 \ + 8bb7cf88a3bc1da890f1f29398120bf3 \ + 6f624bdbf4b410cfbe8603fa3bef44fa" + DOWNLOAD_x86_64="" + MD5SUM_x86_64="" + REQUIRES="" + MAINTAINER="B. Watson" + EMAIL="urchlay@slackware.uk" + +We can extract all the download URLs from the file with: + + $ bsgrep '^DOWNLOAD' trs80-roms.info + + DOWNLOAD="https://www.filfre.net/misc/trs_roms.zip http://cpmarchives.classiccmp.org/trs80/mirrors/www.discover-net.net/~dmkeil/trs80/files/trs80-62.zip https://www.tim-mann.org/trs80/ld4-631.zip https://archive.org/download/mame-0.250-roms-split_202212/MAME%200.250%20ROMs%20%28split%29/trs80m4p.zip http://www.tim-mann.org/trs80/xtrs-4.9d.tar.gz https://www.classic-computers.org.nz/system-80/disks/NEWDOS_80sssd_jv1.DSK" + DOWNLOAD_x86_64="" + +All the URLs are listed as one long line (apologies for the ugly formatting). 
+Note that the whitespace that indents the continuation lines is +preserved. In this case, the whitespace is all spaces, but tabs would +be treated the same way. To compress the whitespace into a single space, +use the B<-w> option. + +=head1 DIAGNOSTICS + +Unless disabled with the B<-s> option, B<bsgrep> may print these messages +on standard error: + + bsgrep: <file>: stripping carriage returns + +The input file has MS-DOS/Windows CRLF line endings. B<bsgrep>'s +output will have these removed. Note that other Unix-flavored tools +that understand continuation lines will generally fail when fed CRLF +files. + + bsgrep: <file>:<line>: whitespace after continuation, malformed input? + +In shell scripts (and most other uses of backslash continuation), a +line that ends with whitespace after the backslash is not treated as a +continuation line. This is a very easy error to create, when manually +editing files. The above warning will help you avoid this. As usual, +it can be ignored if you know exactly what you're doing. + + bsgrep: <file>:<line>: last line ends with continuation + +This warning is self-explanatory. There's nothing for the last line +to continue onto, so this is almost certainly an error. + +The above warnings don't affect the exit status. + +=head1 ENVIRONMENT + +B<bsgrep> doesn't define any environment variables of its own, but +it does pay attention to B<LANG>, B<LC_ALL>, and B<LC_CTYPE>. If any +of these contain the string I<UTF-8>, the input and output will be +read/written as Unicode, encoded as UTF-8. If the input turns out not +to be Unicode, it will be assumed ISO-8859-1, and converted to Unicode. + +=head1 EXIT STATUS + +0 if there were any matches, 1 if there were none, or 2 if there +were errors (e.g. nonexistent file). However, with B<-q>, the exit +status will be 0 or 1 even if there were errors. This is the same as +B<grep>'s exit status. + +=head1 LIMITATIONS + +B<bsgrep> doesn't detect binary files like B<grep> does. 
It can and +will print them to your terminal instead of "binary file matches". + +Not all B<grep> options are supported. Options that aren't implemented +but might be someday include B<--color>, B<-a>, B<-A>, B<-B>, B<-C>, B<-o>. +I don't intend to implement every single option B<grep> has, there are +too many of them. + +There are no long options other than B<--help>, B<--man>, and B<--version>. + +B<bsgrep> does not comply with the POSIX (or any other) standard for +B<grep>, and does not intend to. + +Locale support isn't quite the same as B<grep>: in a UTF-8 locale, +if the input isn't plain ASCII or valid UTF-8, it will be treated +as ISO-8859-1, internally converted to Unicode, and output will be +UTF-8. This isn't intended; it's a side-effect of how Perl UTF-8 +filehandles work. In non-UTF-8 locales, things should work as +expected. I hope. + +=head1 AUTHOR + +B<bsgrep> was written by B. Watson <urchlay@slackware.uk> and released +under the WTFPL: Do WTF you want with this. + +=head1 SEE ALSO + +B<grep>(1), B<perl>(1) + +=cut |
