From c9b34791d6e725a77d0ce23e62ad57905f832f73 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Wed, 30 Sep 2015 00:21:56 -0400 Subject: Add purge --rebuild --- sbosrcarch | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 18 deletions(-) (limited to 'sbosrcarch') diff --git a/sbosrcarch b/sbosrcarch index 6acd856..8479cd1 100755 --- a/sbosrcarch +++ b/sbosrcarch @@ -78,13 +78,16 @@ quickly and not eat many resources. For each new URL, the file is downloaded and added to the archive, but the old file is *not* deleted (use 'sbosrcarch purge' to do that). -=item purge +=item purge [-r|--rebuild] Purge files from the archive that are no longer referenced by any -.info file. Should be run monthly or quarterly as a cron job. This is +.info file. Should be run monthly or quarterly as a cron job. This is more resource-intensive than an update, as it must read and parse every .info file in the SBo repository. +If -r or --rebuild is given, the entire by-md5 tree is deleted and recreated. +This shouldn't be needed unless $symlinks is changed. + =item trim Gets rid of files that are in the archive, but are larger than the size @@ -93,9 +96,9 @@ reason to run it any other time. =item check -TODO: this is not yet implemented. +TODO: check is not yet implemented! -Checks the integrity and coverage of the archive: Will report at least these conditions: +Checks the integrity and coverage of the archive. Reports at least these conditions: - dangling symlinks - invalid md5sums @@ -145,7 +148,7 @@ B for details. The archive created by sbosrcarch consists of two top-level directories called B<by-name> and B<by-md5>. All files are present in both hierarchies -(as hardlinked copies, to save space). +(as hard or symbolic links, to save space). B<by-name> is organized by the familiar category and PRGNAM, like SBo itself. Example: @@ -162,7 +165,8 @@ file needed. 
The same file as the example above would be found at: by-md5/f/4/f4d413f880754fd6677290160f8bc5d7/ifstatus-v1.1.0.tar.gz Notice there are two layers of subdirectory, named after the first two -hex digits in the md5sum. +hex digits in the md5sum. Also, notice that the actual SlackBuilds and +.info files are not present in the archive. There is one other directory of files used/maintained by sbosrcarch: a git clone of SBo's master git branch. This is cloned and updated @@ -185,7 +189,7 @@ The git clone directory doesn't need to be served to the public. TODO: example Apache, proftpd, etc configs for serving up the archive. -=head1 EXAMPLE +=head1 CLIENT-SIDE EXAMPLE TODO: shell script that parses an .info file and tries to d/l the source from the archive. @@ -213,16 +217,32 @@ that I'm not planning to address any time soon: No threading. Not likely to change. It would be possible to spawn wget processes in the background, but I'm not going to complicate it that way. +It would mainly be useful for create mode, and hopefully each archive +site only needs to do that once. -Anything that checks referer header or otherwise tries to stop automated -downloads, will stop us. This isn't really a bug (sbopkg can't handle them -either). - -Length: unspecified isn't handled (we just don't download these). Specifically, -dropbox URLs do this. +There maybe should be a whitelist and a blacklist. The whitelist would be +a list of builds (or entire categories) that you want to mirror all of, +regardless of file size limits. The blacklist would be a list of builds +or categories you don't want to mirror, ever. Probably I won't add this +unless multiple people ask for it. -$sbogitdir and $archivedir must be located on the same filesystem, as files -are moved around by linking them. +Anything that checks referer header or otherwise tries to stop automated +downloads, will stop us. This isn't really a bug (sbopkg can't handle +them either). 
Usually the README will say "you must download the file +with a browser" or such. You can still download the file manually +and use "sbosrcarch add category/prgnam filename.tar.gz" to add it +to the archive... but please pay attention to licensing! Some files +(e.g. Oracle's Java) don't allow redistribution, so please don't include +them in your archive. + +Length: unspecified isn't handled (we just don't download +these). Specifically, dropbox URLs do this. Might add an option that +controls what to do about these, e.g. download & keep them all instead +of ignoring them all. Can still add them manually. + +$sbogitdir and $archivedir must be located on the same filesystem, +as files are moved around by linking them. Not a major problem, just +thought I'd mention it. =head1 AUTHOR @@ -236,7 +256,7 @@ use File::Find; use Digest::MD5; use Net::FTP; use POSIX 'getcwd'; -use File::Path 'make_path'; +use File::Path qw/make_path remove_tree/; use File::Copy 'copy'; sub read_config { @@ -715,10 +735,23 @@ sub update_mode { # category/name dir, but not mentioned in the filename hash (also, rm its # md5_dir() counterpart). # 3. do a trim_post() pass to delete any empty dirs and/or dangling symlinks +# If --rebuild is given, pass 3 instead deletes the by-md5 tree and +# recreates it. # FIXME: files from different URLs but with the same filename will not be # purged when they should, because the comparison is solely filename-based! 
sub purge_mode { + my $rebuild = 0; + + shift @ARGV; + if($ARGV[0]) { + if($ARGV[0] =~ /^--?r(?:ebuild)?/) { + $rebuild = 1; + } else { + die "Unknown option: $ARGV[0]\n"; + } + } + init_git(); $purgebytes = $purgefiles = 0; @@ -736,7 +769,13 @@ sub purge_mode { find({wanted => \&purge_pass_2_wanted, no_chdir => 1}, "by-name"); # pass 3 - trim_post(); + if($rebuild) { + remove_tree("by-md5"); + print "Removed by-md5 tree, rebuilding\n"; + find({wanted => \&rebuild_wanted, no_chdir => 1}, "by-name"); + } else { + trim_post(); + } printf("Purged $purgefiles files, %.1fMB\n", ($purgebytes / (1024 * 1024))); exit 0; @@ -764,10 +803,27 @@ sub purge_pass_2_wanted { $purgebytes += -s $_; $purgefiles++; - unlink md5_dir(md5sum_file($_)). "$file"; + unlink md5_dir(md5sum_file($_)). "/$file"; unlink $_; } +sub rebuild_wanted { + return unless -f; + + s,^\./,,; # remove leading ./ + my $md5dir = md5_dir(md5sum_file($_)); + my (undef, $category, $prgnam, $filename) = split /\//, $_; + + make_path($md5dir); + + if($symlinks) { + symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename, + $md5dir . "/" . $filename); + } else { + link($_, $md5dir . "/" . $filename); + } +} + # helper for trim_mode sub trim_wanted { return unless -f $_; @@ -904,6 +960,9 @@ sub add_or_rm_mode { } } +sub check_mode { +} + sub usage { my $self = $0; $self =~ s,.*/,,; @@ -941,6 +1000,7 @@ for ($ARGV[0]) { /add/ && do { add_or_rm_mode(); }; /rm/ && do { add_or_rm_mode(); }; /trim/ && do { trim_mode(); }; + /check/ && do { check_mode(); }; usage(); } -- cgit v1.2.3