diff options
author | B. Watson <yalhcru@gmail.com> | 2020-05-28 01:13:23 -0400 |
---|---|---|
committer | B. Watson <yalhcru@gmail.com> | 2020-05-28 01:13:23 -0400 |
commit | 722e9a7b6816c960ff1e02b352dfb909983e8f1b (patch) | |
tree | d27364345968c00696139d2ccdcbc8f14d34722b | |
parent | 923f8f45a32edef8b812948f9fe15b84634575c5 (diff) | |
download | misc-scripts-722e9a7b6816c960ff1e02b352dfb909983e8f1b.tar.gz |
add unfb2: convert fictionbook to text or html
-rwxr-xr-x | unfb2 | 197 |
1 files changed, 197 insertions, 0 deletions
#!/usr/bin/perl -w

=pod

=head1 NAME

unfb2 - convert FB2 (FictionBook) files to HTML or plain text

=head1 SYNOPSIS

unfb2 [-f] [-h] [-l] [-x] [-2] [--] filename.fb2

unfb2 --help | --man

=head1 DESCRIPTION

unfb2 converts a FB2 book to HTML or text. This is a "quick & dirty"
conversion. No attempt to save the metadata in the <description> of
the document is made. Binary objects (e.g. images) are not included in
the output.

By default, unfb2 converts <filename>.fb2 to <filename>.txt. With the
B<-h> option, the output is <filename>.html. If the input filename has
no extension, the output extension is simply appended. The output file
is always in the current directory; if it already exists, unfb2 will
refuse to overwrite it without the B<-f> option.

When converting to text, the FB2 is first converted to HTML, then run
through one of three html-to-text converters, depending on the option
given (default is links).

=head1 OPTIONS

=over 4

=item B<--help>

You're reading it now.

=item B<--man>

Output the --help in troff format, suitable for use as a man page.

=item B<-f>

Forcibly overwrite the output file if it already exists.

=item B<-h>

Convert to HTML rather than text.

=item B<-l>

Convert HTML to text with this command:

  links -html-margin 0 -dump $file.html > $file.txt

This is actually the default, when no options are given.

=item B<-x>

Convert HTML to text with this command:

  lynx -nomargins -dump $file.html > $file.txt

=item B<-2>

Convert HTML to text with this command:

  html2text $file.html > $file.txt

=item B<-->

End of options. Use this if the input filename begins with a dash.

=back

If you'd like to use a different HTML-to-text converter, just say -h
for HTML output and run your converter separately.

=head1 BUGS

Only UTF-8 encoding is supported. Not sure anyone ever uses anything
else with FictionBook, so it probably doesn't matter.

The conversion is pretty dumb, and doesn't attempt to handle the full
FB2 spec. links is good about ignoring unknown tags, so this works OK
for text conversions.

There's no support for compressed FB2 files. Extract it first.

=head1 AUTHOR

B. Watson <yalhcru@gmail.com>

=head1 LICENSE

unfb2 is released under the WTFPL: Do WTF you want with this.

=head1 SEE ALSO

links(1), lynx(1), html2text(1), convertlit(1), FBReader(1)

=cut

use strict;
use warnings;

our $VERSION = 0.1;

# we're going to use shell redirection to execute a command, so
# best use temp filenames we *know* won't have shell metacharacters.
# Takes an optional extension (default "tmp") and returns a name in
# the current directory that does not currently exist.
sub mktmpfile {
    my $ext = shift || "tmp";
    my $name;

    # retry on the (unlikely) chance the random name is already taken,
    # so we never clobber somebody else's file.
    do {
        $name = "unfb2_tmp_" . int(rand(2**32)) . "." . $ext;
    } while(-e $name);

    return $name;
}

# Build the output filename for $infile: directory part removed (output
# always goes to the current directory), extension replaced by $fmt.
# The directory is stripped *first* so a dot in a directory name (e.g.
# "my.dir/book") is never mistaken for the start of the extension.
sub output_name {
    my ($infile, $fmt) = @_;

    (my $name = $infile) =~ s,.*/,,;
    $name =~ s/\.[^.]*$//;    # drop the old extension, if there is one

    return "$name.$fmt";
}

# True if the two paths are the same string, or both exist and refer to
# the same device/inode (e.g. "./foo.html" vs "foo.html").
sub same_file {
    my ($first, $second) = @_;

    return 1 if $first eq $second;

    my @first_stat  = stat($first);
    my @second_stat = stat($second);
    return 0 unless @first_stat && @second_stat;

    return ($first_stat[0] == $second_stat[0]
         && $first_stat[1] == $second_stat[1]) ? 1 : 0;
}

# Remove <binary>...</binary> elements from one line of input.
# Arguments: the line, and a flag saying whether we are already inside
# a <binary> element opened on an earlier line.
# Returns: (line with binary content removed, updated flag).
sub strip_binary {
    my ($line, $in_binary) = @_;

    if($in_binary) {
        # still inside an element from an earlier line: discard
        # everything up to and including the closing tag.
        return ('', 1) unless $line =~ s{^.*?</binary\s*>}{}is;
        $in_binary = 0;
    }

    # elements that open and close on this same line
    $line =~ s{<binary\b.*?</binary\s*>}{}gis;

    # an element that opens here and continues on following lines
    if($line =~ s{<binary\b.*$}{}is) {
        $in_binary = 1;
    }

    return ($line, $in_binary);
}

(my $SELF = $0) =~ s,.*/,,;

my $outfmt    = "txt";
my $convcmd   = "links -html-margin 0 -dump";
my $overwrite = 0;

while(@ARGV && $ARGV[0] =~ /^-./) {
    my $opt = shift;

    if($opt eq '--') {
        last;    # everything after this is a filename
    } elsif($opt =~ /^--h(elp)?/) {
        # list-form exec: no shell, so odd characters in $0 are harmless
        exec("perldoc", $0)
            or die "$SELF: can't run perldoc: $!\n";
    } elsif($opt eq '--man') {
        exec("pod2man", "--stderr", "-s1", "-cUrchlay's Miscellany",
             "-r$VERSION", $0)
            or die "$SELF: can't run pod2man: $!\n";
    } elsif($opt eq '-f') {
        $overwrite = 1;
    } elsif($opt =~ /^-h(?:h?tml)?$/) {
        # -h or -html; -hhtml is still accepted for compatibility
        $outfmt = "html";
    } elsif($opt eq '-x') {
        $convcmd = "lynx -nomargins -dump";
    } elsif($opt eq '-2') {
        $convcmd = "html2text";
    } elsif($opt eq '-l') {
        # this was the default anyway
    } else {
        die("$SELF: unknown option $opt, try --help\n");
    }
}

my $infile = shift;
if((not defined $infile) || ($infile eq '-')) {
    die "$SELF: missing input filename (can't read from stdin, sorry)\n";
}

my $outfile = output_name($infile, $outfmt);

if(same_file($infile, $outfile)) {
    die "$SELF: can't read and write from the same file ($infile)\n";
}

if((!$overwrite) && (-e $outfile)) {
    die("$SELF: output file $outfile already exists, not overwriting\n");
}

open my $in, "<:encoding(UTF-8)", $infile or die "$SELF: $infile: $!\n";

warn "$SELF: output file is $outfile\n";

# For text output the HTML is only an intermediate file; for HTML
# output it is the final result.
my $htmlfile = ($outfmt eq "txt") ? mktmpfile("html") : $outfile;

open my $out, ">:encoding(UTF-8)", $htmlfile or die "$SELF: $htmlfile: $!\n";

my $in_body   = 0;    # have we seen the opening <body> yet?
my $in_binary = 0;    # are we inside a multi-line <binary> element?
my $fb_ok     = 0;    # number of lines containing a <FictionBook tag

while(my $line = <$in>) {
    $fb_ok++ if $line =~ /<FictionBook/i;

    if(!$in_body) {
        # skip everything (the <description> etc.) before the body
        next unless $line =~ /<body\b[^>]*>/i;
        print {$out} "<html><meta charset=\"UTF-8\"><body>";
        $in_body = 1;
    }

    ($line, $in_binary) = strip_binary($line, $in_binary);
    next unless length $line;

    $line =~ s{<(/?)section\b(.*?)>}{<${1}div${2}>}ig; # TODO: something better?
    $line =~ s{<(/?)title\b(.*?)>}{<${1}center${2}>}ig;
    $line =~ s{<empty-line(.*?)>}{<br/><br/>}ig;
    $line =~ s{</FictionBook>}{</html>}ig;

    print {$out} $line;
}

close $in;
close $out or die "$SELF: $htmlfile: $!\n";    # catch buffered write errors

if($fb_ok != 1) {
    warn "$SELF: input didn't look like valid FB2, output may be bogus\n";
}

if($outfmt eq "txt") {
    my $tmpfile = mktmpfile("txt");

    # Both filenames come from mktmpfile(), so they're shell-safe.
    my $status = system("$convcmd $htmlfile > $tmpfile");

    unlink($htmlfile);

    # Don't touch an existing $outfile unless the conversion worked.
    if($status != 0) {
        unlink($tmpfile);
        die "$SELF: command '$convcmd' failed (is it installed?), " .
            "$outfile not written\n";
    }

    # rename() replaces $outfile in one step and doesn't need hard links.
    if(!rename($tmpfile, $outfile)) {
        my $err = $!;
        unlink($tmpfile);
        die "$SELF: rename($tmpfile, $outfile): $err\n";
    }
}