#!/usr/bin/env perl

# Copyright (c) 2009, Douglas Haber
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above 
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * The names names of the authors may not be used to endorse or
#       promote products derived from this software without specific
#       prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND ITS
# CONTRIBUTERS ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR ITS
# CONTRIBUTERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.


# ***** GWF
# *****   Get Web Files
# *****   Green Watermelon Factory
# *****   Gorglegon Waste Facility

# This script will get all linked files that match the regex and
# save them into a temp directory.  If no regex is provided a list
# of files will be printed, but no files will be downloaded.

# WARNING:  This is a quick and nasty hack!
#           This gives unnecessary errors!
#           This mostly works!
#           This can probably be done better with wget options!

# Usage:  gwf [regexp] [url] ...

# User-Agent string handed to wget (-U) when main() fetches a page.
our $BROWSER_ID = q{Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)};

# Report whether a string compiles as a Perl regular expression.
#
# Parameters:
#   $_[0] - candidate pattern text; undef is treated as the empty pattern.
# Returns:
#   1 when the pattern compiles, 0 when compiling it dies.
# Side effects:
#   Prints a "processing '<pattern>'" progress line to STDOUT.
sub valid_regex {
    my ($pattern) = @_;
    $pattern = '' if !defined $pattern;

    print "processing '$pattern'\n";

    # A malformed pattern dies when it is compiled; eval traps that.  The
    # trailing 1 is the eval's own success value -- a `return` inside an
    # eval block would only leave the eval, not this sub.
    my $compiled_ok = eval {
        my $compiled = qr/$pattern/;
        1;
    };

    return $compiled_ok ? 1 : 0;
}

# Entry point: list or download the links found on each page named in @ARGV.
#
# Command line (taken from @ARGV, which is consumed):
#   gwf [regexp] url ...
#     regexp - pattern matched (case-insensitively) against every HREF value.
#              When it is omitted (a single argument is taken to be a URL)
#              or is empty/false, the links are only listed.
#     url    - one or more pages to fetch with wget.
#
# Download mode: for each URL a directory "ftmp-<n>" is created in the
# current directory, a SOURCE file naming the page is written into it, every
# matching link is fetched into it with wget, and the directory is listed
# with `ls -l` afterwards.
#
# Dies when no arguments are given, when the regexp does not compile, when a
# page cannot be fetched, or when the download directory cannot be created
# or entered.  Exits with status 1 as soon as wget fails on a download.
sub main {
    # Strictures are switched on for this sub only; they are lexically
    # scoped, so the rest of the file is unaffected.
    use strict;
    use warnings;

    our $BROWSER_ID;    # User-Agent string assigned at file scope

    my $link_count = 0;    # number printed in front of each reported link
    my $dir_count  = 0;    # download directories created so far

    if (!@ARGV) { die("ERROR: Usage: gwf [regexp] [url] ...\n"); }

    # A lone argument is a URL to list; with two or more the first one is
    # the regexp.
    my $regexp = @ARGV > 1 ? shift(@ARGV) : '';

    # Validate whenever a regexp will actually be used, not only for one
    # particular argument count.
    if ($regexp && !valid_regex($regexp)) {
        die "ERROR:  Invalid regular expression!\n";
    }

    while (my $url = shift(@ARGV)) {
        print "Getting: $url\n";

        # List-form pipe open: the URL reaches wget as a single argument
        # and is never interpreted by a shell.
        my $page;
        if (open(my $wget_out, '-|', 'wget', "--referer=$url",
                 '-U', $BROWSER_ID, '-O', '-', '--quiet', $url)) {
            local $/;    # slurp the whole page in one read
            $page = <$wget_out>;
            close($wget_out);
        }
        if (!$page) {
            die "ERROR:  Unable to get page!\n";
        }

        # $dir stays undef in list mode so the clean-up below is skipped.
        my $dir;
        if ($regexp) {
            $dir = "ftmp-" . ($$ - $dir_count);
            $dir_count++;
            print "Storing files in $dir\n";

            # An already existing directory is reused.  Failing to enter it
            # must be fatal: otherwise the files land in the current
            # directory and the later chdir("..") leaves it.
            if (!-d $dir && !mkdir($dir)) {
                die "ERROR:  Unable to create '$dir': $!\n";
            }
            chdir($dir) or die "ERROR:  Unable to enter '$dir': $!\n";

            # The SOURCE note is informational only, so a failure to write
            # it is reported but does not stop the run.
            if (open(my $source_fh, '>', 'SOURCE')) {
                print {$source_fh} "Files from $url\n";
                close($source_fh);
            }
            else {
                warn "WARNING:  Unable to write SOURCE: $!\n";
            }
        }

        # Base for relative links: drop the query string first (so that a
        # page name followed by arguments can still be recognised), then
        # the page name, then the trailing slash.
        my $base = $url;
        $base =~ s/\?.*$//;
        $base =~ s/\/[\w\s\-\.]+\.html?$/\//;
        $base =~ s/\/$//;

        # Scheme and host only, used for links that start with a single "/".
        my ($origin) = $base =~ m{\A((?:[a-z][a-z0-9+.\-]*://)?[^/]+)}i;
        $origin = $base if !defined $origin;

        while ($page =~ /HREF\s*=\s*[\"\']?([^\"\']*)[\"\']?/gsi) {
            my $link = $1;

            # Protocol-relative link ("//host/path").
            $link =~ s/^\/\//http:\/\//;

            if (!$regexp) {
                $link_count++;
                print "[$link_count] $link\n";
                next;
            }

            # Only matching links are counted and downloaded.
            next if $link !~ /$regexp/is;
            $link_count++;

            if ($link =~ m{\A[a-z][a-z0-9+.\-]*://}i) {
                # Already absolute (http, https, ftp, ...): use as is.
            }
            elsif ($link =~ m{\A/}) {
                # Root-relative: resolve against the host, not the page's
                # directory.
                $link = $origin . $link;
            }
            else {
                $link = "$base/$link";
            }

            print "[$link_count] downloading '$link'\n";

            # List form: the link text comes from the fetched page and must
            # never reach a shell.
            my $status = system('wget', '--quiet', $link);

            # -1 means wget could not be started at all; report that the
            # way a shell would (127) instead of shifting -1.
            my $exit_code = $status == -1 ? 127 : $status >> 8;
            if ($exit_code) {
                print "WARNING:  wget gave an error! [ec=$exit_code]\n";
                exit(1);
            }
        }

        # Leave and list the download directory only if one was entered.
        if (defined $dir) {
            chdir("..") or die "ERROR:  Unable to leave '$dir': $!\n";
            print "running 'ls -l $dir'\n";
            system('ls', '-l', $dir);
        }
    }
}

# Run the script; main() takes its arguments from @ARGV.
main();