Created
November 14, 2012 09:30
-
-
Save onishi/4071196 to your computer and use it in GitHub Desktop.
HTMLまるごと保存
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use utf8; | |
use DateTime; | |
use Digest::SHA1 qw(sha1_hex); | |
use Encode; | |
use File::Path qw/make_path/; | |
use HTML::Parser; | |
use HTML::ResolveLink; | |
use HTTP::Request::Common qw/GET/; | |
use IO::All; | |
use LWP::UserAgent; | |
use URI; | |
my $path = './'; | |
my $uri = URI->new(shift) or die; | |
my $now = DateTime->now; | |
my $ymd = $now->ymd; | |
my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'); | |
my $resolver = HTML::ResolveLink->new(base => $uri); | |
my $res = $ua->request(GET $uri); | |
my $content = $resolver->resolve($res->decoded_content); | |
my $dir = $uri; | |
$dir =~ s{[^A-Za-z0-9.]+}{-}g; | |
$dir =~ s{-+$}{}; | |
$dir = "$path/$dir/$ymd/"; | |
$dir =~ s{/+}{/}g; | |
make_path($dir); | |
my $disallow_tag = qr{script}; | |
my $nodisplay_tag = qr{noscript}; | |
my $result; | |
my $context = { disallow => 0 }; | |
my $parser = HTML::Parser->new( | |
api_version => 3, | |
start_h => [ | |
sub { | |
my($self, $tagname, $attr, $text) = @_; | |
if ($tagname =~ /^(?:$nodisplay_tag)$/i) { | |
return; | |
} elsif ($tagname =~ /^(?:$disallow_tag)$/i) { | |
$context->{disallow}++; | |
return; | |
} | |
$result .= "<$tagname"; | |
for my $key (sort keys %$attr) { | |
$key eq '/' and next; | |
my $value = $attr->{$key}; | |
if ($key =~ /^(?:src)$/i) { | |
$value = get_src($value); | |
} elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) { | |
$value = get_link($value); | |
} elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { | |
$value = $path; | |
} | |
$result .= qq{ $key="$value"}; | |
} | |
$result .= ">"; | |
}, | |
'self,tagname,attr,text', | |
], | |
end_h => [ | |
sub { | |
my($self, $tagname, $text) = @_; | |
if ($tagname =~ /^(?:$nodisplay_tag)$/i) { | |
return; | |
} elsif ($tagname =~ /^(?:$disallow_tag)$/i) { | |
$context->{disallow}--; | |
return; | |
} | |
$result .= $text; | |
}, | |
'self,tagname,text', | |
], | |
default_h => [ | |
sub { | |
my($self, $text) = @_; | |
if ($context->{disallow} > 0) { | |
return; | |
} | |
$result .= $text; | |
}, | |
'self,text', | |
], | |
); | |
$parser->parse($content); | |
$result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX | |
$result = Encode::encode('utf-8', $result); | |
$result > io("${dir}index.html"); | |
print "${dir}index.html\n"; | |
sub get_src { | |
my $src = shift or return; | |
unless (-e "${dir}file") { | |
make_path("${dir}file"); | |
} | |
my $file = $src; | |
$file =~ s{[^A-Za-z0-9.]+}{-}g; | |
if (length($file) > 255) { | |
$file = sha1_hex($file); | |
} | |
$file = "file/$file"; | |
$file =~ s{/+}{/}g; | |
unless (-e "$dir$file") { | |
$ua->request(GET $src)->content >> io("$dir$file"); | |
sleep(1); # DOS対策対策 | |
} | |
$file; | |
} | |
sub get_link { | |
my $url = shift or return; | |
my $file = get_src($url); | |
my $io = io("$dir$file"); | |
my $content = $io->slurp; | |
$content =~ s{url\(([^\)]+)\)}{ | |
my $link = $1; | |
$link =~ s{^[\s\"\']+}{}; | |
$link =~ s{[\s\"\']+$}{}; | |
# relative link (from HTML::ResolveLink) | |
my $u = URI->new($link); | |
unless (defined $u->scheme) { | |
my $old = $u; | |
$u = $u->abs($url); | |
} | |
$link = get_src($u); | |
$link =~ s{^file/}{}; | |
"url($link)"; | |
}eg; | |
$content > $io; | |
return $file; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
なんか適当に書いた。気が向いたらいい感じにモジュールとかにしたい