aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorB. Watson <yalhcru@gmail.com>2020-05-28 01:13:23 -0400
committerB. Watson <yalhcru@gmail.com>2020-05-28 01:13:23 -0400
commit722e9a7b6816c960ff1e02b352dfb909983e8f1b (patch)
treed27364345968c00696139d2ccdcbc8f14d34722b
parent923f8f45a32edef8b812948f9fe15b84634575c5 (diff)
downloadmisc-scripts-722e9a7b6816c960ff1e02b352dfb909983e8f1b.tar.gz
add unfb2: convert fictionbook to text or html
-rwxr-xr-xunfb2197
1 files changed, 197 insertions, 0 deletions
diff --git a/unfb2 b/unfb2
new file mode 100755
index 0000000..6406b3a
--- /dev/null
+++ b/unfb2
@@ -0,0 +1,197 @@
+#!/usr/bin/perl -w
+
+=pod
+
+=head1 NAME
+
+unfb2 - convert FB2 (FictionBook) files to HTML or plain text
+
+=head1 SYNOPSIS
+
+unfb2 [-h] [-l] [-x] [-2] [filename.fb2]
+
+=head1 DESCRIPTION
+
+unfb2 converts a FB2 book to HTML or text. This is a "quick & dirty"
+conversion. No attempt to save the metadata in the <description> of
+the document is made. Binary objects (e.g. images) are not included in
+the output.
+
+By default, unfb2 converts <filename>.fb2 to <filename>.txt. With the
+B<-h> option, the output is <filename>.html. The output file is always
+in the current directory; if it already exists, unfb2 will refuse to
+overwrite it without the B<-f> option.
+
+When converting to text, the FB2 is first converted to HTML, then run
+through one of three html-to-text converters, depending on the option
+given (default is links).
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<--help>
+
+You're reading it now.
+
+=item B<--man>
+
+Output the --help in troff format, suitable for use as a man page.
+
+=item B<-f>
+
+Forcibly overwrite the output file if it already exists.
+
+=item B<-h>
+
+Convert to HTML rather than text.
+
+=item B<-l>
+
+Convert HTML to text with this command:
+
+ links -html-margin 0 -dump $file.html > file.txt
+
+This is actually the default, when no options are given.
+
+=item B<-x>
+
+Convert HTML to text this this command:
+
+ lynx -nomargins -dump $file.html > file.txt
+
+=item B<-2>
+
+Convert HTML to text this this command:
+
+ html2text $file.html > file.txt
+
+=back
+
+If you'd like to use a different HTML-to-text converter, just say -h
+for HTML output and run your converter separately.
+
+=head1 BUGS
+
+Only UTF-8 encoding is supported. Not sure anyone ever uses anything
+else with FictionBook, so it probably doesn't matter.
+
+The conversion is pretty dumb, and doesn't attempt to handle the full
+FB2 spec. links is good about ignoring unknown tags, so this works OK
+for text conversions.
+
+There's no support for compressed FB2 files. Extract it first.
+
+=head1 AUTHOR
+
+B. Watson <yalhcru@gmail.com>
+
+=head1 LICENSE
+
+unfb2 is released under the WTFPL: Do WTF you want with this.
+
+=head1 SEE ALSO
+
+links(1), lynx(1), html2text(1), convertlit(1), FBReader(1)
+
+=cut
+
+# Script version; passed to pod2man as the release string for --man.
+$VERSION=0.1;
+
+# mktmpfile([$ext])
+#
+# Return a temporary filename of the form "unfb2_tmp_<digits>.<ext>".
+# The name has no directory part, so it refers to the current directory.
+# $ext defaults to "tmp".
+#
+# we're going to use shell redirection to execute a command, so
+# best use temp filenames we *know* won't have shell metacharacters.
+#
+# Names that already exist (including dangling symlinks) are skipped, so
+# the caller's open-for-write doesn't truncate some unrelated file. The
+# file itself is not created here, so there is still a small window
+# between this check and the caller's open.
+sub mktmpfile {
+    my $ext = shift || "tmp";
+    my $name;
+
+    do {
+        $name = "unfb2_tmp_" . int(rand(2**32)) . "." . $ext;
+    } while(-e $name || -l $name);
+
+    return $name;
+}
+
+# Basename of this script, used as the prefix of every diagnostic.
+($SELF = $0) =~ s,.*/,,;
+
+# Defaults: plain text output, produced from the HTML by links, and
+# never overwrite an existing output file.
+$outfmt = "txt";
+$convcmd = "links -html-margin 0 -dump";
+$overwrite = 0;
+
+# Consume leading options. The first argument that doesn't start with
+# "-" followed by at least one more character ends option processing.
+while(@ARGV && $ARGV[0] =~ /^-./) {
+    my $opt = shift;
+
+    # for() aliases $_ to $opt so the bare pattern matches below test it;
+    # "next" leaves this one-element loop and moves on to the next option.
+    for($opt) {
+        # exec only returns on failure, so report that instead of falling
+        # through to the misleading "unknown option" message. The list form
+        # bypasses the shell, so a script path containing spaces or shell
+        # metacharacters is passed through intact.
+        # (Anything starting with --h is treated as a request for help.)
+        /^--h(elp)?/ && do {
+            exec("perldoc", $0)
+                or die "$SELF: can't run perldoc: $!\n";
+        };
+        /^--man$/ && do {
+            exec("pod2man", "--stderr", "-s1", "-cUrchlay's Miscellany",
+                 "-r$VERSION", $0)
+                or die "$SELF: can't run pod2man: $!\n";
+        };
+        /^-f$/ && do { $overwrite = 1; next; };
+        /^-h(html)?$/ && do { $outfmt = "html"; next; };
+        /^-x$/ && do { $convcmd = "lynx -nomargins -dump"; next; };
+        /^-2$/ && do { $convcmd = "html2text"; next; };
+        /^-l$/ && next; # this was the default anyway
+
+        die("$SELF: unknown option $opt, try --help\n");
+    }
+}
+
+# The single remaining argument is the input file; stdin isn't supported.
+$infile = shift;
+if((not defined $infile) || ($infile eq '-')) {
+    die "$SELF: missing input filename (can't read from stdin, sorry)\n";
+}
+
+if(-d $infile) {
+    die "$SELF: $infile is a directory\n";
+}
+
+# The output always goes in the current directory. Strip the directory
+# part *before* touching the extension: otherwise a dot in a directory
+# name ("my.books/novel") is mistaken for the start of the extension and
+# the output ends up named after the directory ("my.txt").
+($outfile = $infile) =~ s,.*/,,;
+
+# Replace the extension with the output format, or append one if the
+# input name has no extension at all.
+$outfile =~ s/\.[^.]*$/.$outfmt/ or $outfile .= ".$outfmt";
+
+if($infile eq $outfile) {
+    die "$SELF: can't read and write from the same file ($infile)\n";
+}
+
+if((!$overwrite) && (-e $outfile)) {
+    die("$SELF: output file $outfile already exists, not overwriting\n");
+}
+
+open $in, "<:encoding(UTF-8)", $infile or die "$SELF: $infile: $!\n";
+
+warn "$SELF: output file is $outfile\n";
+
+# For text output the HTML is only an intermediate step, so it goes to a
+# temp file; for HTML output it is written straight to the output file.
+if($outfmt eq "txt") {
+    $htmlfile = mktmpfile("html");
+} else {
+    $htmlfile = $outfile;
+}
+
+open $out, ">:encoding(UTF-8)", $htmlfile or die "$SELF: $htmlfile: $!\n";
+
+# Line-by-line FB2 -> HTML conversion, reading $in and writing $out.
+#
+#   $in_body    true once the first <body> tag has been seen; everything
+#               before it (the <description> metadata) is discarded.
+#   $in_binary  true while inside a <binary> element that spans lines.
+#   $fb_ok      number of lines containing a <FictionBook tag.
+$in_body = $in_binary = $fb_ok = 0;
+while(<$in>) {
+    $fb_ok++ if /<FictionBook/i;
+
+    if(!$in_body) {
+        next unless /<body\s*>/i;
+        print $out "<html><meta charset=\"UTF-8\"><body>";
+        $in_body = 1;
+    }
+
+    # Drop <binary> elements (embedded images etc.), keeping any markup
+    # that shares a line with them.
+    if($in_binary) {
+        # Inside a multi-line element: skip lines until the closing tag,
+        # then keep only what follows it.
+        next unless s/^.*?<\/binary\s*>//is;
+        $in_binary = 0;
+    }
+    # Elements that open and close on this line. These must not leave
+    # $in_binary set, or the rest of the book would be thrown away.
+    s/<binary\b.*?<\/binary\s*>//gis;
+    # An element that opens here and continues on later lines.
+    $in_binary = 1 if s/<binary\b.*//is;
+
+    s/<(|\/)section(.*?)>/<$1div$2>/ig; # TODO: something better?
+    s/<(|\/)?title(.*?)>/<$1center$2>/ig;
+    s/<empty-line(.*?)>/<br\/><br\/>/ig;
+    s/<\/FictionBook>/<\/html>/ig;
+
+    print $out $_;
+}
+
+close $in;
+
+# Buffered write errors (e.g. disk full) only show up at close time.
+close $out or die "$SELF: $htmlfile: $!\n";
+
+if($fb_ok != 1) {
+    warn "$SELF: input didn't look like valid FB2, output may be bogus\n";
+}
+
+# For text output, run the chosen HTML-to-text converter on the temporary
+# HTML file, then move its result into place as the output file.
+if($outfmt eq "txt") {
+    $tmpfile = mktmpfile("txt");
+
+    # Both filenames come from mktmpfile(), so they contain no shell
+    # metacharacters and can be handed to the shell unquoted.
+    my $status = system("$convcmd $htmlfile > $tmpfile");
+
+    # Work out what went wrong (if anything) right away, before another
+    # system call can clobber $!.
+    my $failure;
+    if($status == -1) {
+        $failure = "can't run: $!";
+    } elsif($status & 127) {
+        $failure = "killed by signal " . ($status & 127);
+    } elsif($status >> 8) {
+        $failure = "exit status " . ($status >> 8);
+    }
+
+    unlink($htmlfile);
+
+    # Don't install an empty or partial file as the output when the
+    # converter is missing or failed.
+    if(defined $failure) {
+        unlink($tmpfile);
+        die "$SELF: '$convcmd' failed ($failure)\n";
+    }
+
+    # rename() replaces any existing output file in one atomic step and,
+    # unlike link(), doesn't need hard link support from the filesystem.
+    rename($tmpfile, $outfile) || die "$SELF: rename($tmpfile, $outfile): $!\n";
+}