#!/usr/bin/env perl
# Copyright (c) 2009, Douglas Haber
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * The names of the authors may not be used to endorse or
# promote products derived from this software without specific
# prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND ITS
# CONTRIBUTORS ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR ITS
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

# ***** GWF
# ***** Get Web Files
# ***** Green Watermelon Factory
# ***** Gorglegon Waste Facility

# This script will get all linked files that match the regex and
# save them into a temp directory. If no regex is provided a list
# of files will be printed, but no files will be downloaded.

# WARNING: This is a quick and nasty hack!
# This gives unnecessary errors!
# This mostly works!
# This can probably be done better with wget options!

# Usage: gwf [regexp] [url] ...
use strict;
use warnings;

# User-Agent string sent with the initial page fetch.
our $BROWSER_ID = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';

# Matches the scheme prefix of an absolute URL ("http:", "ftp:", ...).
my $SCHEME_RE = qr/[A-Za-z][A-Za-z0-9+.\-]*/;

# valid_regex($pattern)
# Reports whether $pattern compiles as a Perl regular expression.
# Prints a "processing" line as a side effect.
# Returns 1 when the pattern compiles, 0 otherwise.
sub valid_regex {
    my ($pattern) = @_;
    $pattern = '' unless defined $pattern;
    print "processing '$pattern'\n";

    # Compilation errors in an interpolated pattern are raised at run
    # time, so the eval turns them into an undefined result.
    my $compiled = eval { qr/$pattern/ };
    return defined $compiled ? 1 : 0;
}

# _fetch_page($url)
# Downloads $url with wget and returns the body as a string ('' when
# nothing was received).  wget is started without a shell, so characters
# such as '&', '?' or quotes in the URL are passed through untouched.
sub _fetch_page {
    my ($url) = @_;
    my @cmd = (
        'wget', "--referer=$url", '-U', $BROWSER_ID,
        '-O', '-', '--quiet', '--', $url,
    );
    open(my $wget, '-|', @cmd)
        or die "ERROR: Unable to run wget: $!\n";
    my $page = do { local $/; <$wget> };

    # The exit status is deliberately ignored here; the caller decides
    # based on whether any content arrived.
    close($wget);
    return defined $page ? $page : '';
}

# _base_url($url)
# Returns the URL that relative links are appended to: $url without its
# query string/fragment, without a trailing "name.htm(l)" page, and
# without a trailing slash.
sub _base_url {
    my ($url) = @_;

    # The query must go first, otherwise "page.html?x=1" never matches
    # the page-name pattern below.
    $url =~ s{[?\#].*\z}{}s;

    # The look-behind keeps "http://host.html" from losing its host.
    $url =~ s{(?<!/)/[\w\s\-.]+\.html?\z}{/};
    $url =~ s{/\z}{};
    return $url;
}

# _absolute_link($base, $link)
# Turns a link found in a page into something wget can fetch:
#   - links that already carry a scheme are returned unchanged;
#   - links starting with "/" are resolved against the host of $base;
#   - anything else is appended to $base.
sub _absolute_link {
    my ($base, $link) = @_;
    return $link if $link =~ m{\A$SCHEME_RE:};
    if ($link =~ m{\A/} and $base =~ m{\A($SCHEME_RE://[^/]+)}) {
        return $1 . $link;
    }
    return "$base/$link";
}

# _extract_links($page)
# Returns the value of every HREF attribute in $page, in document order.
# Double-quoted, single-quoted and unquoted attribute values are handled.
sub _extract_links {
    my ($page) = @_;
    my @links;
    while ($page =~ m{\bhref\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))}gi) {
        # Exactly one of the three alternatives captured something.
        my ($value) = grep { defined } ($1, $2, $3);
        push @links, $value;
    }
    return @links;
}

# main()
# Entry point.  Reads "[regexp] url ..." from @ARGV.
#   - With an empty regexp, or a single argument (taken to be a URL),
#     the links of every page are listed and nothing is downloaded.
#   - With a regexp, every link matching it (case-insensitively) is
#     downloaded into a fresh "ftmp-N" directory per URL, and the
#     directory is listed afterwards.
# Dies on usage errors, an invalid regexp, an empty page, or when the
# download directory cannot be set up; exits with status 1 as soon as a
# wget download fails.
sub main {
    my @args = @ARGV;
    die("ERROR: Usage: gwf [regexp] [url] ...\n") unless @args;

    # A lone argument can only usefully be a URL.
    my $regexp = @args > 1 ? shift @args : '';
    my $matcher;
    if (length $regexp) {
        valid_regex($regexp)
            or die "ERROR: Invalid regular expression!\n";
        $matcher = qr/$regexp/is;
    }

    my $count     = 0;    # running number of reported links, all URLs
    my $dir_index = 0;    # number of download directories created so far

    for my $url (@args) {
        next unless length $url;

        print "Getting: $url\n";
        my $page = _fetch_page($url);
        die "ERROR: Unable to get page!\n" unless length $page;

        my $dir;
        if ($matcher) {
            $dir = 'ftmp-' . ($$ - $dir_index);
            $dir_index++;
            print "Storing files in $dir\n";
            unless (-d $dir) {
                mkdir $dir
                    or die "ERROR: Unable to create '$dir': $!\n";
            }

            # Must be fatal: otherwise files land in the wrong place and
            # the chdir '..' below would leave the starting directory.
            chdir $dir
                or die "ERROR: Unable to enter '$dir': $!\n";

            # Record where the files came from (best effort).
            if (open(my $source, '>', 'SOURCE')) {
                print {$source} "Files from $url\n";
                close($source)
                    or warn "WARNING: Unable to write SOURCE: $!\n";
            }
            else {
                warn "WARNING: Unable to create SOURCE: $!\n";
            }
        }

        my $base = _base_url($url);

        # Protocol-relative links ("//host/path") inherit the page's scheme.
        my ($scheme) = $url =~ m{\A($SCHEME_RE):};
        $scheme = 'http' unless defined $scheme;

        for my $href (_extract_links($page)) {
            next unless length $href;
            my $link = $href;
            $link =~ s{\A//}{$scheme://};

            if (!$matcher) {
                $count++;
                print "[$count] $link\n";
                next;
            }
            next unless $link =~ $matcher;

            $count++;
            $link = _absolute_link($base, $link);
            print "[$count] downloading '$link'\n";

            # List form: the link comes from a remote page and must never
            # be interpreted by a shell; '--' stops option parsing.
            my $status = system('wget', '--quiet', '--', $link);
            if ($status != 0) {
                my $ec = $status == -1 ? "could not run wget: $!"
                       :                 ($status >> 8);
                print "WARNING: wget gave an error! [ec=$ec]\n";
                exit(1);
            }
        }

        if (defined $dir) {
            chdir '..'
                or die "ERROR: Unable to leave '$dir': $!\n";
            print "running 'ls -l $dir'\n";
            system('ls', '-l', $dir);
        }
    }
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without triggering any downloads.
main() unless caller;

1;