add unfb2: convert fictionbook to text or html

author: B. Watson <yalhcru@gmail.com> 2020-05-28 01:13:23 -0400
committer: B. Watson <yalhcru@gmail.com> 2020-05-28 01:13:23 -0400
commit: 722e9a7b6816c960ff1e02b352dfb909983e8f1b (patch)
tree: d27364345968c00696139d2ccdcbc8f14d34722b
parent: 923f8f45a32edef8b812948f9fe15b84634575c5 (diff)
download: misc-scripts-722e9a7b6816c960ff1e02b352dfb909983e8f1b.tar.gz
1 files changed, 197 insertions, 0 deletions
diff --git a/unfb2 b/unfb2
new file mode 100755
index 0000000..6406b3a
--- /dev/null
+++ b/unfb2
@@ -0,0 +1,197 @@
+#!/usr/bin/perl -w
+
+=pod
+
+=head1 NAME
+
+unfb2 - convert FB2 (FictionBook) files to HTML or plain text
+
+=head1 SYNOPSIS
+
+unfb2 [-f] [-h] [-l] [-x] [-2] filename.fb2
+
+=head1 DESCRIPTION
+
+unfb2 converts a FB2 book to HTML or text. This is a "quick & dirty"
+conversion. No attempt to save the metadata in the <description> of
+the document is made. Binary objects (e.g. images) are not included in
+the output.
+
+By default, unfb2 converts <filename>.fb2 to <filename>.txt. With the
+B<-h> option, the output is <filename>.html. The output file is always
+in the current directory; if it already exists, unfb2 will refuse to
+overwrite it without the B<-f> option.
+
+When converting to text, the FB2 is first converted to HTML, then run
+through one of three html-to-text converters, depending on the option
+given (default is links).
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<--help>
+
+You're reading it now.
+
+=item B<--man>
+
+Output the --help in troff format, suitable for use as a man page.
+
+=item B<-f>
+
+Forcibly overwrite the output file if it already exists.
+
+=item B<-h>
+
+Convert to HTML rather than text.
+
+=item B<-l>
+
+Convert HTML to text with this command:
+
+  links -html-margin 0 -dump $file.html > file.txt
+
+This is actually the default, when no options are given.
+
+=item B<-x>
+
+Convert HTML to text with this command:
+
+  lynx -nomargins -dump $file.html > file.txt
+
+=item B<-2>
+
+Convert HTML to text with this command:
+
+  html2text $file.html > file.txt
+
+=back
+
+If you'd like to use a different HTML-to-text converter, just say -h
+for HTML output and run your converter separately.
+
+=head1 BUGS
+
+Only UTF-8 encoding is supported. Not sure anyone ever uses anything
+else with FictionBook, so it probably doesn't matter.
+
+The conversion is pretty dumb, and doesn't attempt to handle the full
+FB2 spec. links is good about ignoring unknown tags, so this works OK
+for text conversions.
+
+There's no support for compressed FB2 files. Extract them first.
+
+=head1 AUTHOR
+
+B. Watson <yalhcru@gmail.com>
+
+=head1 LICENSE
+
+unfb2 is released under the WTFPL: Do WTF you want with this.
+
+=head1 SEE ALSO
+
+links(1), lynx(1), html2text(1), convertlit(1), FBReader(1)
+
+=cut
+
+$VERSION=0.1;
+
+# mktmpfile([$ext]) - return a name for a temporary file in the current
+# directory, of the form "unfb2_tmp_<number>.<ext>". $ext defaults to
+# "tmp" when omitted (or false).
+#
+# we're going to use shell redirection to execute a command, so
+# best use temp filenames we *know* won't have shell metacharacters.
+#
+# The file itself is not created here; the caller opens (or redirects
+# to) the returned name. Names that already exist are skipped, so an
+# unrelated file in the current directory never gets truncated. -l is
+# checked as well as -e because -e is false for a dangling symlink.
+sub mktmpfile {
+	my $ext = shift || "tmp";
+	my $name;
+
+	do {
+		$name = "unfb2_tmp_" . int(rand(2**32)) . "." . $ext;
+	} while(-e $name || -l $name);
+
+	return $name;
+}
+
+# Name of this script without any leading directories, used as the
+# prefix of every message.
+($SELF = $0) =~ s,.*/,,;
+
+# Defaults: plain text output, converted from HTML by links, and never
+# overwrite an existing output file.
+$outfmt = "txt";
+$convcmd = "links -html-margin 0 -dump";
+$overwrite = 0;
+
+# Consume leading arguments that look like options (a dash followed by
+# at least one more character). Whatever is left in @ARGV afterwards is
+# taken to be the input filename.
+while(@ARGV && $ARGV[0] =~ /^-./) {
+	my $opt = shift;
+
+	# One-element "switch": $_ is aliased to $opt, and "next" ends the
+	# for loop, which lets the enclosing while loop carry on with the
+	# next argument.
+	for($opt) {
+		# --help and --man replace this process with perldoc/pod2man.
+		# The list form of exec bypasses the shell, so a script path
+		# containing spaces or shell metacharacters is passed intact.
+		# exec only returns if it failed, hence the die.
+		/^--h(elp)?/   && do {
+			exec("perldoc", $0);
+			die("$SELF: can't run perldoc: $!\n");
+		};
+		/^--man$/      && do {
+			exec("pod2man", "--stderr", "-s1", "-cUrchlay's Miscellany", "-r$VERSION", $0);
+			die("$SELF: can't run pod2man: $!\n");
+		};
+		/^-f$/         && do { $overwrite = 1;                     next; };
+		/^-h(html)?$/  && do { $outfmt = "html";                   next; };
+		/^-x$/         && do { $convcmd = "lynx -nomargins -dump"; next; };
+		/^-2$/         && do { $convcmd = "html2text";             next; };
+		/^-l$/         && next; # this was the default anyway
+
+		die("$SELF: unknown option $opt, try --help\n");
+	}
+}
+
+# The one remaining argument is the input file. The output filename is
+# derived from it, so reading from stdin ("-") is not supported.
+$infile = shift;
+if((not defined $infile) || ($infile eq '-')) {
+	die "$SELF: missing input filename (can't read from stdin, sorry)\n";
+}
+
+# The output file always goes in the current directory: take the input's
+# basename and replace its extension (or append one, if it has none)
+# with the output format. The directory part has to be stripped *before*
+# looking for the extension, otherwise a dot in the directory part (e.g.
+# "./book" or "my.books/novel") gets mistaken for the extension. The (.)
+# keeps a leading dot (hidden file) from being treated as an extension.
+($outfile = $infile) =~ s,.*/,,;
+$outfile =~ s/(.)\.[^.]*$/$1/;
+$outfile .= ".$outfmt";
+
+# Comparing names only catches identical spellings; "./x.html" and
+# "x.html" are the same file too. So when the output already exists,
+# also compare device and inode numbers. An inode of 0 means the
+# platform doesn't report real inode numbers, so that case is skipped.
+my @in_stat  = stat($infile);
+my @out_stat = stat($outfile);
+my $same_inode = @in_stat && @out_stat
+	&& $in_stat[1] != 0
+	&& $in_stat[0] == $out_stat[0]
+	&& $in_stat[1] == $out_stat[1];
+
+if(($infile eq $outfile) || $same_inode) {
+	die "$SELF: can't read and write from the same file ($infile)\n";
+}
+
+if((!$overwrite) && (-e $outfile)) {
+	die("$SELF: output file $outfile already exists, not overwriting\n");
+}
+
+open $in, "<:encoding(UTF-8)", $infile or die "$SELF: $infile: $!\n";
+
+warn "$SELF: output file is $outfile\n";
+
+# Text output is produced by converting an intermediate HTML file, which
+# gets a temporary name. HTML output is written straight to the output
+# file.
+if($outfmt eq "txt") {
+	$htmlfile = mktmpfile("html");
+} else {
+	$htmlfile = $outfile;
+}
+
+open $out, ">:encoding(UTF-8)", $htmlfile or die "$SELF: $htmlfile: $!\n";
+
+# Copy the input to the HTML file a line at a time, rewriting a handful
+# of FB2 tags as HTML on the way.
+#   $in_body   - true once the first <body> tag has been seen; everything
+#                before it is skipped.
+#   $in_binary - true while inside a <binary> element that was opened on
+#                an earlier line and hasn't been closed yet.
+#   $fb_ok     - number of lines containing a <FictionBook opening tag.
+$in_body = $in_binary = $fb_ok = 0;
+while(<$in>) {
+	$fb_ok++ if /<FictionBook/i;
+
+	if(!$in_body) {
+		next unless /<body\s*>/i;
+		print $out "<html><meta charset=\"UTF-8\"><body>";
+		$in_body = 1;
+	}
+
+	# Binary objects are left out of the output. An element may open and
+	# close on the same line or span many lines, so:
+	# 1. if one is still open from an earlier line, drop everything up to
+	#    and including its closing tag (or the whole line if it doesn't
+	#    close here);
+	if($in_binary) {
+		next unless s/^.*?<\/binary\s*>//;
+		$in_binary = 0;
+	}
+	# 2. drop any elements that are complete on this line;
+	s/<binary\b.*?<\/binary\s*>//g;
+	# 3. if one opens here without closing, drop the rest of the line and
+	#    remember that we're inside it. Anything before the opening tag
+	#    is still output; the newline removed with the tail is restored.
+	if(s/<binary\b.*//s) {
+		$in_binary = 1;
+		next unless /\S/;
+		$_ .= "\n";
+	}
+
+	s/<(|\/)section(.*?)>/<$1div$2>/ig; # TODO: something better?
+	s/<(|\/)?title(.*?)>/<$1center$2>/ig;
+	s/<empty-line(.*?)>/<br\/><br\/>/ig;
+	s/<\/FictionBook>/<\/html>/ig;
+
+	print $out $_;
+}
+
+close $in;
+
+# buffered write errors (e.g. disk full) only show up at close time.
+close $out or die "$SELF: $htmlfile: $!\n";
+
+if($fb_ok != 1) {
+	warn "$SELF: input didn't look like valid FB2, output may be bogus\n";
+}
+
+# For text output, run the chosen converter on the intermediate HTML
+# file, then move the result into place as the output file.
+if($outfmt eq "txt") {
+	$tmpfile = mktmpfile("txt");
+
+	# This goes through the shell for the redirection. Both filenames
+	# come from mktmpfile(), so they need no quoting.
+	my $status = system("$convcmd $htmlfile > $tmpfile");
+
+	# Work out what went wrong (if anything) right away, before the
+	# unlink calls below get a chance to overwrite $!.
+	my $problem;
+	if($status == -1) {
+		$problem = "can't run it: $!";
+	} elsif($status & 127) {
+		$problem = "killed by signal " . ($status & 127);
+	} elsif($status != 0) {
+		$problem = "exit status " . ($status >> 8);
+	}
+
+	unlink($htmlfile);
+
+	# If the converter failed, leave any existing output file alone
+	# rather than replacing it with empty or partial output.
+	if(defined $problem) {
+		unlink($tmpfile);
+		die "$SELF: '$convcmd' failed ($problem), $outfile not written\n";
+	}
+
+	# rename replaces an existing output file in a single step.
+	rename($tmpfile, $outfile) || die "$SELF: rename($tmpfile, $outfile): $!\n";
+}
author	B. Watson <yalhcru@gmail.com>	2020-05-28 01:13:23 -0400
committer	B. Watson <yalhcru@gmail.com>	2020-05-28 01:13:23 -0400
commit	722e9a7b6816c960ff1e02b352dfb909983e8f1b (patch)
tree	d27364345968c00696139d2ccdcbc8f14d34722b
parent	923f8f45a32edef8b812948f9fe15b84634575c5 (diff)
download	misc-scripts-722e9a7b6816c960ff1e02b352dfb909983e8f1b.tar.gz