From 1ca17096c09d824854a66715040e8582ec36f9e9 Mon Sep 17 00:00:00 2001
From: "B. Watson"
Date: Wed, 21 Oct 2015 18:12:28 -0400
Subject: curl fixes

---
 sbosrcarch      | 44 +++++++++++++++++++++++++-------------------
 sbosrcarch.conf |  9 +--------
 2 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/sbosrcarch b/sbosrcarch
index 676be36..b0f9d86 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -35,7 +35,10 @@
 # Ideas for future features:
 
 # - autopurge option for update. It only needs to purge the dirs that
-#   got updated, so should be quick.
+#   got updated, so should be quick.... except what happens if two builds
+#   use the same source file, one gets updated and the other doesn't? if
+#   the purge doesn't parse all the info files in the repo, it can't know
+#   not to delete the by-md5 in that case. Ugh.
 
 =pod
 
@@ -93,8 +96,8 @@ large files are mostly games, if that influences your decision any.
 
 =item B<-c> I<config file>
 
 Read specified config file instead of searching in the default locations
-for it. See B<CONFIG FILE> section below for default. This option must appear
-first on the command line.
+for it. See B<CONFIG FILE> section below for default. This option must
+appear first on the command line, if used.
 
 =item B<create>
 
@@ -129,7 +132,7 @@ more resource-intensive than an update, as it must read and parse every
 .info file in the SBo repository.
 
 If -r or --rebuild is given, the entire by-md5 tree is deleted and
 recreated.
-This shouldn't be needed unless $symlinks is changed.
+This shouldn't be needed unless $symlinks (see B<CONFIG FILE>) is changed.
 
 =item B<trim>
 
@@ -328,12 +331,6 @@ processes in the background, but I'm not going to complicate it that way.
 It would mainly be useful for create mode, and hopefully each archive
 site only needs to do that once.
 
-There maybe should be a whitelist and a blacklist. The whitelist would be
-a list of builds (or entire categories) that you want to mirror all of,
-regardless of file size limits. The blacklist would be a list of builds
-or categories you don't want to mirror, ever. Probably I won't add this
-unless multiple people ask for it.
-
 Anything that checks referer header or otherwise tries to stop automated
 downloads, will stop us. This isn't really a bug (sbopkg can't handle
 them either). Usually the README will say "you must download the file
@@ -343,9 +340,12 @@ to the archive... but please pay attention to licensing! Some files
 (e.g. Oracle's Java) don't allow redistribution, so please don't include
 them in your archive.
 
-Length: unspecified isn't handled (we just don't download these). Might
-add an option that controls what to do about these, e.g. download &
-keep them all instead of ignoring them all. Can still add them manually.
+For URLs that won't give us a Content-Length header, we can't determine
+the file size. If $maxfilemegs is zero (unlimited), this doesn't
+matter: everything gets downloaded. If there's a size limit, and we
+can't determine the size, we don't download these... unless they're
+whitelisted. They can still be added manually, either with the -f option
+or by downloading them separately and adding them as local files.
 
 =head1 AUTHOR
 
@@ -590,18 +590,23 @@ sub curl_download_http {
 	my $tries = 1 + ($url =~ /github\.com/);
 
 	for(1..$tries) {
-		open my $fh, "$curl $curlopts " .
+		my $cmd =
+			"$curl $curlopts " .
 			user_agent($url) .
 			" --head -X GET " .
 			wget_quote_url($url) .
- " 2>$outfile |" - or die $!; + " 2>$outfile |"; +# warn "$cmd\n"; + open my $fh, $cmd or die $!; local $/ = "\r\n"; while(<$fh>) { chomp; +# print "$_\n"; $httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/; $size = $1 if /^Content-Length:\s+(\d+)/; +# warn "$httpstatus" if $httpstatus; +# warn "$size" if $size; } close $fh; last if $size; @@ -626,6 +631,7 @@ sub curl_download_http { return undef; } elsif(toobig($size)) { printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024); + $skipcount++; return undef; } } @@ -640,7 +646,7 @@ sub curl_download_http { " > $outfile 2>&1"); if($retval != 0) { - open my $fh, ") { print " ! $_"; } @@ -1022,13 +1028,13 @@ sub handle_info_file { } if(! -f $filename) { $failcount++; - print " not downloaded\n"; + print "! not downloaded\n"; next; } if(md5sum_file($filename) ne $md5) { $failcount++; - print " md5sum failed\n"; + print "! md5sum failed\n"; unlink($filename); next; } diff --git a/sbosrcarch.conf b/sbosrcarch.conf index 042ae2e..c4a7d5f 100644 --- a/sbosrcarch.conf +++ b/sbosrcarch.conf @@ -125,14 +125,7 @@ $curl = "curl"; # actually download a file, other options will be added to these (but # nothing you should have to mess with). -$curlopts = qw( - -K/dev/null - --insecure - --connect-timeout 60 - --head - -L - -sS -); +$curlopts = "-K/dev/null --insecure -L -sS --connect-timeout 60"; ##### wget options (only used if $use_curl is false) -- cgit v1.2.3