aboutsummaryrefslogtreecommitdiff
path: root/bsgrep
diff options
context:
space:
mode:
Diffstat (limited to 'bsgrep')
-rwxr-xr-xbsgrep454
1 files changed, 454 insertions, 0 deletions
diff --git a/bsgrep b/bsgrep
new file mode 100755
index 0000000..6dd5c43
--- /dev/null
+++ b/bsgrep
@@ -0,0 +1,454 @@
+#!/usr/bin/perl -w
+
+$VERSION = "0.0.1";
+
+use Getopt::Std;
+use File::Find;
+
+# Program name without any leading directory; used as the message prefix
+# and to decide between bsgrep and bsjoin behaviour.
+($self = $0) =~ s,.*/,,;
+
+# Names of files already reported by -l, so each is printed only once.
+%printed = ();
+
+# Route every warning (ours, Perl's own, File::Find's) through one handler
+# so that messages get a uniform "$self: " prefix, -s can silence them,
+# and warnings we did not raise ourselves set the "error" exit status.
+$SIG{__WARN__} = sub {
+    my $m = shift;
+
+    # don't include the line number in warnings.
+    $m =~ s/ at \S+ line \d+\.$//;
+
+    # File::Find seems to use double newlines for its warnings..
+    $m =~ s/\n\n+/\n/;
+
+    # warnings that don't start with $self: are e.g. file access errors
+    # from the 'while(<>)' or File::Find. $self comes from $0, so it is
+    # quoted with \Q...\E: a program name containing regex metacharacters
+    # (such as '+' or '(') must still be compared literally.
+    if($m !~ /^\Q$self\E:/) {
+        $m = "$self: $m";
+        $ret = 2 unless $opt{q};
+    }
+
+    print STDERR $m unless $opt{s};
+};
+
+# Parse the command line for bsgrep mode.
+#
+# Every "-e PATTERN" / "-ePATTERN" is moved, in order, from @ARGV into
+# @patterns; the remaining arguments are handed to getopts(), which fills
+# %opt and leaves only the non-option arguments in @ARGV. Exits with
+# status 1 on a usage error.
+sub grep_options {
+    my @nargv;
+    my $was_e = 0;
+    my $opts_done = 0;
+
+    # first, grab all the -e options and remove them from @ARGV,
+    # because Getopt::Std doesn't support multiple occurrences of
+    # a flag with different args. probably it would be better to
+    # use Getopt::Long, but for now this works.
+    for my $arg (@ARGV) {
+        if($opts_done) {
+            # everything after "--" is an operand, even if it looks like -e
+            push @nargv, $arg;
+        } elsif($was_e) {
+            push @patterns, $arg;
+            $was_e = 0;
+        } elsif($arg eq '--') {
+            $opts_done = 1;
+            push @nargv, $arg;
+        } elsif($arg eq '-e') {
+            $was_e = 1;
+        } elsif($arg =~ /^-e(.+)/s) {
+            # anchored: only an argument that *starts* with -e is the
+            # option; "some-expr.txt" is an ordinary argument.
+            push @patterns, $1;
+        } else {
+            push @nargv, $arg;
+        }
+    }
+
+    # a trailing -e with nothing after it is a usage error, not a no-op.
+    if($was_e) {
+        warn "$self: option requires an argument -- e\n";
+        exit 1;
+    }
+
+    @ARGV = @nargv;
+
+    getopts('d:FiklnNqrsvwz', \%opt) || exit 1;
+}
+
+# Write one logical (already joined) line to standard output.
+#
+# $_[0] is the text to print. It is preceded by "FILE:" when more than
+# one input file is being read and by "LINE:" when -n is in effect, and
+# followed by NUL under -z or a newline otherwise.
+sub print_line {
+    my $prefix = '';
+    $prefix .= "$ARGV:" if $filecount > 1;
+    $prefix .= "$start_line:" if $opt{n};
+
+    my $terminator = $opt{z} ? "\0" : "\n";
+
+    print $prefix, $_[0], $terminator;
+}
+
+# Parse the command line for bsjoin mode: only the flags related to
+# joining are accepted. getopts() fills %opt and removes the options from
+# @ARGV; if it reports failure the program exits with status 1.
+sub join_options {
+    my $parsed_ok = getopts('d:knwz', \%opt);
+    exit 1 unless $parsed_ok;
+}
+
+# Decide whether the joined logical line in $out is selected and, if so,
+# report it.
+#
+# A line "matches" when any pattern in @patterns matches it, or, with -N,
+# when all of them do. -v inverts that overall result (so "-v -e a -e b"
+# selects lines matching neither pattern, as grep does) instead of being
+# applied to each pattern separately.
+#
+# For a selected line: $ret goes from 1 to 0 (an error status of 2 is left
+# alone), nothing is printed under -q, -l prints the current file name
+# once, and otherwise the line is printed via print_line().
+sub handle_line {
+    # number of patterns that match the line
+    my $hits = grep { $out =~ /$_/ } @patterns;
+
+    my $selected = $opt{N} ? ($hits == @patterns) : ($hits > 0);
+    $selected = !$selected if $opt{v};
+
+    return unless $selected;
+
+    $ret = 0 if $ret == 1;
+    return if $opt{q};
+
+    if($opt{l}) {
+        if(!$printed{$ARGV}++) {
+            print "$ARGV\n";
+        }
+    } else {
+        print_line($out);
+    }
+}
+
+### main()
+# TODO: do we need 'use locale'?
+# also, why does reading iso-8859-1 text auto-convert to utf-8?
+# NOTE(review): binmode below only touches STDIN and STDOUT; files named
+# on the command line are read through <>, which presumably gets no
+# :utf8 layer from this -- confirm, it may explain the question above.
+#
+# Switch stdin/stdout to UTF-8 if any of the usual locale variables
+# mentions it (spelled "utf8" or "utf-8", in any case).
+if(grep { ($ENV{$_} // "") =~ /utf-?8/i } qw/LANG LC_CTYPE LC_ALL/) {
+    binmode(\*STDIN, ':utf8');
+    binmode(\*STDOUT, ':utf8');
+}
+
+# Long options are only recognised as the very first argument.
+# --help (or -help) and --man replace this process with perldoc/pod2man
+# run on this script; exit(1) is reached only if the exec itself fails.
+if(defined($ARGV[0])) {
+    if($ARGV[0] =~ /^--?help$/) {
+        # anchored, so a first argument such as the pattern 'self-helper'
+        # is not mistaken for a help request. List-form exec bypasses the
+        # shell, so a script path containing spaces or shell
+        # metacharacters is passed through intact.
+        exec 'perldoc', $0;
+        exit(1);
+    } elsif($ARGV[0] eq '--man') {
+        exec 'pod2man', '--stderr', '-s1', '-cUrchlaysStuff',
+            "-r$VERSION", '-u', $0;
+        exit(1);
+    } elsif($ARGV[0] eq '--version') {
+        print "bsgrep $VERSION\n";
+        exit(0);
+    }
+}
+
+# Pick the personality from the program name: anything containing "join"
+# behaves as bsjoin, everything else as bsgrep.
+if($self =~ /join/) {
+    join_options();
+    push @patterns, '^'; # every string has a beginning...
+} else {
+    grep_options();
+
+    # without -e, the first remaining argument is the pattern. Test for
+    # its presence rather than its truth, so that the patterns '0' and ''
+    # are accepted. There is no usage helper in this file, so a missing
+    # pattern is reported directly.
+    if(!@patterns) {
+        if(!@ARGV) {
+            die("$self: missing required pattern argument\n");
+        }
+        push @patterns, shift @ARGV;
+    }
+
+    # -F: match the patterns literally; -i: ignore case.
+    @patterns = map { quotemeta } @patterns if $opt{F};
+    @patterns = map { "(?i)$_" } @patterns if $opt{i};
+}
+
+# return value from main(), set to 0 if anything matched. Initialised
+# before the directory walk, so that a status of 2 recorded by the
+# warning handler for a File::Find error is not overwritten afterwards.
+$ret = 1;
+
+# -r: replace each directory argument with the regular files below it.
+if($opt{r}) {
+    my @found;
+
+    @ARGV = (".") unless @ARGV;
+    for my $arg (@ARGV) {
+        if(-d $arg) {
+            find({
+                    # explicit lstat: File::Find only promises a valid
+                    # "_" stat buffer when follow/follow_fast is set.
+                    # Using lstat also keeps symlinks inside the tree
+                    # from being treated as regular files.
+                    wanted => sub { push @found, $_ if lstat($_) && -f _; },
+                    follow => 0,
+                    no_chdir => 1 },
+                $arg);
+        } else {
+            push @found, $arg;
+        }
+    }
+
+    # with nothing to read, an empty @ARGV would make <> fall back to
+    # standard input; there is simply nothing to search.
+    exit $ret unless @found;
+
+    @ARGV = @found;
+}
+
+$filecount = @ARGV; # used to decide whether to print filename prefixes.
+
+# continuation marker (-d, default backslash) as a regex-safe fixed string.
+$cont = quotemeta($opt{d} // '\\');
+
+$/ = "\0" if $opt{z};
+
+# $out accumulates the logical line being joined; undef means no line is
+# in progress. It is tested with defined() throughout, because a pending
+# line may legitimately be "" (a line holding only the continuation
+# marker) or "0".
+while(<>) {
+    chomp;
+    if(s/\r//) {
+        if(!$cr_warning) {
+            warn "$self: $ARGV: stripping carriage returns\n" unless $opt{s};
+            $cr_warning = 1;
+        }
+    }
+    if(/$cont\s+$/) {
+        warn "$self: $ARGV:$.: whitespace after continuation, malformed input?\n" unless $opt{s};
+    }
+
+    # -w: strip leading whitespace from continuation lines only.
+    s/^\s+// if defined $out && $opt{w};
+
+    # remember where the logical line started, for -n.
+    $start_line = $. unless defined $out;
+    $out .= $_;
+    if(/$cont$/) {
+        # line continues: drop the marker (unless -k) and keep reading.
+        if(!$opt{k}) {
+            $out =~ s/$cont$//;
+        }
+    } else {
+        handle_line();
+        undef $out;
+    }
+} continue {
+    # reset $. on each new file (perldoc -f eof)
+    if(eof) {
+        # a logical line still pending at end of file is flushed here, so
+        # it can never leak into the next file.
+        if(defined $out) {
+            warn "$self: $ARGV:$.: last line ends with continuation\n" unless $opt{s};
+            handle_line();
+            undef $out;
+        }
+        close ARGV;
+        $cr_warning = 0;
+    }
+}
+
+exit $ret;
+
+### rest of file is the docs
+
+=pod
+
+=head1 NAME
+
+bsgrep - search for strings in files with backslash continuation
+
+bsjoin - join lines with backslash continuation
+
+=head1 SYNOPSIS
+
+bsgrep [B<[-FiklnNqrsvwz]> B<-d> I<char> I<...>] [B<-e> I<pattern> ... | I<pattern>] [I<file> I<...>]
+
+bsjoin [B<[-knwz]> B<-d> I<char> I<...>] [I<file> I<...>]
+
+=head1 DESCRIPTION
+
+B<bsgrep> (backslash grep) uses a regular expression to search for
+strings in a file, much like B<grep>(1). The main difference is,
+B<bsgrep> joins together lines that use the backslash for continuation
+(e.g. as B<sh>(1) does).
+
+Other differences: B<bsgrep> doesn't support the full set of B<grep>
+options, and it uses Perl regular expressions rather than POSIX.
+
+Input is read from one or more files, or standard input if no files
+are given. Output goes to standard output.
+
+The search is done after lines are joined together, so the regex can
+match text split across continuation lines.
+
+If B<bsgrep> is run as B<bsjoin> (via symbolic or hard link, or just
+copying the executable), it will simply join together continued lines
+without searching for anything. In this mode, only the B<-d>, B<-k>,
+B<-n>, B<-w>, B<-z>, B<--version>, B<--help>, and B<--man> options are
+supported.
+
+=head1 OPTIONS
+
+These options work with both B<bsgrep> and B<bsjoin>:
+
+=over 4
+
+=item -d I<char>
+
+Use I<char> as the continuation character, rather than a backslash.
+Actually, there's no law that says it has to be a single character,
+if you can think of a use for a string here... though it's treated as
+a fixed string, not a regular expression. This option does not exist
+in B<grep>.
+
+=item -k
+
+Keep the continuation characters when joining continued lines together.
+This option does not exist in B<grep>.
+
+=item -n
+
+Prefix output lines with line numbers (same as B<grep>). For lines
+that are split with continuation characters, the line number will be
+that of the first line in the set.
+
+=item -w
+
+For continuation lines, remove any leading whitespace. This option is
+specific to B<bsgrep>. The B<grep> B<-w> option can be simulated with
+the Perl B<\b> syntax in the regex.
+
+=item -z
+
+Use zero bytes (ASCII NUL) rather than newlines for line terminators,
+for both input and output. Same as B<grep>.
+
+=item --version
+
+Print the version of B<bsgrep> and exit.
+
+=item --help
+
+Prints this help text, via B<perldoc>(1).
+
+=item --man
+
+Prints this help text as a man page, via B<pod2man>(1). Suggested use:
+
+ bsgrep --man > bsgrep.1
+
+=back
+
+These options are only supported by B<bsgrep>:
+
+=over 4
+
+=item -e I<pattern>
+
+Use I<pattern> as the pattern. May be used multiple times, in which case
+they are ORed together (a line that matches any I<pattern> is a match)... unless
+the B<-N> option is used, q.v. Same as B<grep>.
+
+=item -F
+
+Treat pattern(s) as fixed strings, not regular expression(s). Same as B<grep>.
+
+=item -i
+
+Case-insensitive search (same as B<grep>).
+
+=item -l
+
+Instead of printing lines that match, print only the names of files
+that contain matches (same as B<grep>).
+
+=item -N
+
+When multiple patterns are given with multiple B<-e> options, only
+select lines that match all of the patterns; the default is to select
+lines that match any of the patterns. This option doesn't exist
+in B<grep>.
+
+=item -q
+
+Quiet: don't write to standard output. Exit status will be zero if
+a match was found, even if there were errors. This doesn't prevent
+warnings/errors being printed to standard error; use B<-s> to silence
+those. Same as B<grep>.
+
+=item -r
+
+Recursively read all files under each directory, following symlinks
+only if they're on the command line. If no files or directories are
+given, reads the current directory. Same as B<grep>.
+
+=item -s
+
+Silence warnings (same as B<grep>). This includes error messages
+about unreadable files as well as warnings about the input (see
+B<DIAGNOSTICS>, below).
+
+=item -v
+
+Print only lines that do I<not> match (same as B<grep>).
+
+=back
+
+=head1 EXAMPLE
+
+Given the file B<trs80-roms.info> (which comes from SlackBuilds.org), containing:
+
+ PRGNAM="trs80-roms"
+ VERSION="20230516"
+ HOMEPAGE="https://sdltrs.sourceforge.net/docs/index.html"
+ DOWNLOAD="https://www.filfre.net/misc/trs_roms.zip \
+ http://cpmarchives.classiccmp.org/trs80/mirrors/www.discover-net.net/~dmkeil/trs80/files/trs80-62.zip \
+ https://www.tim-mann.org/trs80/ld4-631.zip \
+ https://archive.org/download/mame-0.250-roms-split_202212/MAME%200.250%20ROMs%20%28split%29/trs80m4p.zip \
+ http://www.tim-mann.org/trs80/xtrs-4.9d.tar.gz \
+ https://www.classic-computers.org.nz/system-80/disks/NEWDOS_80sssd_jv1.DSK"
+ MD5SUM="ecd2c47c0624885fbcfb17889241f0ed \
+ 9b342f4401801bbc947e303cbeb9902f \
+ f2678aa45b76d935a34a0cd2b108925d \
+ 8a0f1567df8f166f4056a6a71ef7dce5 \
+ 8bb7cf88a3bc1da890f1f29398120bf3 \
+ 6f624bdbf4b410cfbe8603fa3bef44fa"
+ DOWNLOAD_x86_64=""
+ MD5SUM_x86_64=""
+ REQUIRES=""
+ MAINTAINER="B. Watson"
+ EMAIL="urchlay@slackware.uk"
+
+We can extract all the download URLs from the file with:
+
+ $ bsgrep '^DOWNLOAD' trs80-roms.info
+
+ DOWNLOAD="https://www.filfre.net/misc/trs_roms.zip http://cpmarchives.classiccmp.org/trs80/mirrors/www.discover-net.net/~dmkeil/trs80/files/trs80-62.zip https://www.tim-mann.org/trs80/ld4-631.zip https://archive.org/download/mame-0.250-roms-split_202212/MAME%200.250%20ROMs%20%28split%29/trs80m4p.zip http://www.tim-mann.org/trs80/xtrs-4.9d.tar.gz https://www.classic-computers.org.nz/system-80/disks/NEWDOS_80sssd_jv1.DSK"
+ DOWNLOAD_x86_64=""
+
+All the URLs are listed as one long line (apologies for the ugly formatting).
+Note that the whitespace that indents the continuation lines is
+preserved. In this case, the whitespace is all spaces, but tabs would
+be treated the same way. To compress the whitespace into a single space,
+use the B<-w> option.
+
+=head1 DIAGNOSTICS
+
+Unless disabled with the B<-s> option, B<bsgrep> may print these messages
+on standard error:
+
+ bsgrep: <file>: stripping carriage returns
+
+The input file has MS-DOS/Windows CRLF line endings. B<bsgrep>'s
+output will have these removed. Note that other Unix-flavored tools
+that understand continuation lines will generally fail when fed CRLF
+files.
+
+ bsgrep: <file>:<line>: whitespace after continuation, malformed input?
+
+In shell scripts (and most other uses of backslash continuation), a
+line that ends with whitespace after the backslash is not treated as a
+continuation line. This is a very easy error to create, when manually
+editing files. The above warning will help you avoid this. As usual,
+it can be ignored if you know exactly what you're doing.
+
+ bsgrep: <file>:<line>: last line ends with continuation
+
+This warning is self-explanatory. There's nothing for the last line
+to continue onto, so this is almost certainly an error.
+
+The above warnings don't affect the exit status.
+
+=head1 ENVIRONMENT
+
+B<bsgrep> doesn't define any environment variables of its own, but
+it does pay attention to B<LANG>, B<LC_ALL>, and B<LC_CTYPE>. If any
+of these contain the string I<UTF-8>, the input and output will be
+read/written as Unicode, encoded as UTF-8. If the input turns out not
+to be Unicode, it will be assumed ISO-8859-1, and converted to Unicode.
+
+=head1 EXIT STATUS
+
+0 if there were any matches, 1 if there were none, or 2 if there
+were errors (e.g. nonexistent file). However, with B<-q>, the exit
+status will be 0 or 1 even if there were errors. This is the same as
+B<grep>'s exit status.
+
+=head1 LIMITATIONS
+
+B<bsgrep> doesn't detect binary files like B<grep> does. It can and
+will print them to your terminal instead of "binary file matches".
+
+Not all B<grep> options are supported. Options that aren't implemented
+but might be someday include B<--color>, B<-a>, B<-A>, B<-B>, B<-C>, B<-o>.
+I don't intend to implement every single option B<grep> has, there are
+too many of them.
+
+There are no long options other than B<--help>, B<--man>, and B<--version>.
+
+B<bsgrep> does not comply with the POSIX (or any other) standard for
+B<grep>, and does not intend to.
+
+Locale support isn't quite the same as B<grep>: in a UTF-8 locale,
+if the input isn't plain ASCII or valid UTF-8, it will be treated
+as ISO-8859-1, internally converted to Unicode, and output will be
+UTF-8. This isn't intended; it's a side-effect of how Perl UTF-8
+filehandles work. In non-UTF-8 locales, things should work as
+expected. I hope.
+
+=head1 AUTHOR
+
+B<bsgrep> was written by B. Watson <urchlay@slackware.uk> and released
+under the WTFPL: Do WTF you want with this.
+
+=head1 SEE ALSO
+
+B<grep>(1), B<perl>(1)
+
+=cut