#!/usr/bin/perl -w

=pod

=head1 NAME

unfb2 - convert FB2 (FictionBook) files to HTML or plain text

=head1 SYNOPSIS

unfb2 [-f] [-h] [-l] [-x] [-2] filename.fb2

=head1 DESCRIPTION

unfb2 converts an FB2 book to HTML or text. This is a "quick & dirty"
conversion. No attempt is made to save the metadata in the <description>
element of the document. Binary objects (e.g. images) are not included
in the output.

By default, unfb2 converts .fb2 to .txt. With the B<-h> option, the
output is .html.

The output file is always in the current directory; if it already
exists, unfb2 will refuse to overwrite it without the B<-f> option.

When converting to text, the FB2 is first converted to HTML, then
run through one of three html-to-text converters, depending on the
option given (default is links).

=head1 OPTIONS

=over 4

=item B<--help>

You're reading it now.

=item B<--man>

Output the --help in troff format, suitable for use as a man page.

=item B<-f>

Forcibly overwrite the output file if it already exists.

=item B<-h>

Convert to HTML rather than text.

=item B<-l>

Convert HTML to text with this command:

  links -html-margin 0 -dump $file.html > file.txt

This is actually the default, when no options are given.

=item B<-x>

Convert HTML to text with this command:

  lynx -nomargins -dump $file.html > file.txt

=item B<-2>

Convert HTML to text with this command:

  html2text $file.html > file.txt

=back

If you'd like to use a different HTML-to-text converter, just say -h
for HTML output and run your converter separately.

=head1 BUGS

Only UTF-8 encoding is supported. Not sure anyone ever uses anything
else with FictionBook, so it probably doesn't matter.

The conversion is pretty dumb, and doesn't attempt to handle the full
FB2 spec. links is good about ignoring unknown tags, so this works OK
for text conversions.

There's no support for compressed FB2 files. Extract them first.

=head1 AUTHOR

B. Watson

=head1 LICENSE

unfb2 is released under the WTFPL: Do WTF you want with this.

=head1 SEE ALSO

links(1), lynx(1), html2text(1), convertlit(1), FBReader(1)

=cut

# Script version; interpolated into the pod2man command line for --man.
$VERSION=0.1;

# we're going to use shell redirection to execute a command, so
# best use temp filenames we *know* won't have shell metacharacters.
#
# mktmpfile([$ext]) returns a name of the form unfb2_tmp_<integer>.<ext>.
# The name has no directory part, and $ext falls back to "tmp" when it is
# omitted or false. Nothing is created on disk here and the name is not
# checked against files that already exist.
sub mktmpfile {
    my $ext = shift || "tmp";
    return "unfb2_tmp_" . int(rand(2**32)) . "." . $ext;
}

# Program name with any leading directory removed; prefixes every message.
($SELF = $0) =~ s,.*/,,;

# Defaults: text output, produced by running links over the generated HTML.
$outfmt = "txt";
$convcmd = "links -html-margin 0 -dump";
$overwrite = 0;

# Consume leading arguments that start with a dash followed by at least
# one more character; a lone "-" is left in @ARGV.
while(@ARGV && $ARGV[0] =~ /^-./) {
    my $opt = shift;
    # for() aliases $_ to $opt so the bare pattern matches below test it.
    # "next" leaves this one-element loop, which returns control to the
    # enclosing while for the next argument.
    for($opt) {
        # exec() replaces this process, so nothing after it runs on success.
        # NOTE(review): this pattern is not anchored at the end, so any
        # argument beginning with --h is treated as --help.
        /^--h(elp)?/ && do { exec("perldoc $0"); };
        /^--man$/ && do {
            exec("pod2man --stderr -s1 -c\"Urchlay's Miscellany\" -r$VERSION $0");
        };
        /^-f$/ && do { $overwrite = 1; next; };
        # NOTE(review): as written this accepts -h and -hhtml (not -html);
        # confirm whether (tml)? was intended.
        /^-h(html)?$/ && do { $outfmt = "html"; next; };
        /^-x$/ && do { $convcmd = "lynx -nomargins -dump"; next; };
        /^-2$/ && do { $convcmd = "html2text"; next; };
        /^-l$/ && next; # this was the default anyway
        die("$SELF: unknown option $opt, try --help\n");
    }
}

# The first remaining argument is the input file; "-" is rejected because
# reading from stdin is not supported.
$infile = shift;
if((not defined $infile) || ($infile eq '-')) {
    die "$SELF: missing input filename (can't read from stdin, sorry)\n";
}

# Output name: swap the last ".ext" of the input name for the output
# format, then drop any directory part so the result is a bare filename.
# A name with no "." is left unchanged by the first substitution.
($outfile = $infile) =~ s/\.([^.]*)$/.$outfmt/;
$outfile =~ s,.*/,,;

# This is a string comparison of the two names, not a check that they
# refer to the same file on disk.
if($infile eq $outfile) {
    die "$SELF: can't read and write from the same file ($infile)\n";
}

if((!$overwrite) && (-e $outfile)) {
    die("$SELF: output file $outfile already exists, not overwriting\n");
}

open $in, "<:encoding(UTF-8)", $infile or die "$SELF: $infile: $!\n";

warn "$SELF: output file is $outfile\n";

# For text output the HTML is an intermediate file with a generated name;
# for HTML output it is written straight to the final output name.
if($outfmt eq "txt") {
    $htmlfile = mktmpfile("html");
} else {
    $htmlfile = $outfile;
}

open $out, ">:encoding(UTF-8)", $htmlfile or die "$SELF: $htmlfile: $!\n";

$in_body = $in_binary = $fb_ok = 0;

# NOTE(review): the statements from here to the closing brace before
# "close $out" do not parse as written. Two patterns and one string literal
# are empty, and an "if(" is opened but never closed. It looks as though
# angle-bracketed text was stripped from this copy of the file -- TODO
# restore this span from a pristine copy before relying on it. The tokens
# are left exactly as found; the line breaks and indentation here are only
# a guess at the original nesting.
while(<$in>) {
    $fb_ok++ if //i;
    print $out "";
    $in_body = 1;
    }

    if(//<$1div$2>/ig; # TODO: something better?
    # Opening and closing "title" tags become "center" tags; whatever
    # followed the tag name inside the brackets is carried over.
    s/<(|\/)?title(.*?)>/<$1center$2>/ig;
    s///ig;
    # The closing FictionBook tag becomes the closing html tag.
    s/<\/FictionBook>/<\/html>/ig;
    # The current line is written to the HTML file unless $in_binary is set.
    print $out $_ unless $in_binary;
}

close $out;

# $fb_ok is bumped by the first statement of the loop above; any count
# other than exactly 1 produces this warning, and processing continues.
if($fb_ok != 1) {
    warn "$SELF: input didn't look like valid FB2, output may be bogus\n";
}

# Text output: run the chosen converter over the intermediate HTML.
if($outfmt eq "txt") {
    $tmpfile = mktmpfile("txt");
    # The single-string form of system() goes through the shell, which is
    # what performs the ">" redirection. The exit status is not checked.
    system("$convcmd $htmlfile > $tmpfile");
    unlink($htmlfile);
    # Remove any existing output file, give the converted text its final
    # name with a hard link, then drop the temporary name.
    # NOTE(review): rename() might do this in one step -- confirm whether
    # link() was chosen deliberately.
    unlink($outfile);
    link($tmpfile, $outfile) || die "$SELF: link($tmpfile, $outfile): $!\n";
    unlink($tmpfile);
}