Perl:LWP 模块

Perl 模块 LWP 自动下载网页文件代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/perl

# Usage:
# Put this script and the index file (it must be named "index") in the same directory.
# Create a directory for your files, e.g. C:/corpus,
# and run the following command from the directory that holds the script and the index:
# perl downloader.pl C:/corpus 1 10
# All the files will be saved in the directory C:/corpus.
# Check the file error_log for anything that goes wrong.

use strict;
use warnings;

use File::Spec;
use LWP;

# Download a range of pages listed in the tab-separated file "index".
#
# Command line:  perl downloader.pl <output_dir> <start_id> <end_id>
#   output_dir  existing directory that receives the downloaded files
#   start_id    first data row to fetch (row 1 is the line after the header)
#   end_id      last data row to fetch (inclusive)
#
# For every selected row, column 4 (index 3) is the path appended to the
# archive base URL and column 6 (index 5) is the numeric id used to build the
# zero-padded output file name.  Anything that fails for a single row is
# written to "error_log" and the run continues with the next row; problems
# that make the whole run pointless (bad arguments, unreadable index,
# unwritable log) abort the script immediately.
my $usage = "usage: perl $0 <output_dir> <start_id> <end_id>\n";

my $ua = LWP::UserAgent->new;
my $dir = shift @ARGV;
my $start_id = shift @ARGV;
my $end_id = shift @ARGV;

# Fail early with a readable message instead of doing arithmetic on undef.
die $usage
    unless defined($dir) && defined($start_id) && defined($end_id);
die $usage
    unless $start_id =~ /\A\d+\z/ && $end_id =~ /\A\d+\z/;
die "Output directory '$dir' does not exist\n" unless -d $dir;

open my $INDEX, '<', 'index'
    or die "Cannot open 'index' for reading: $!\n";
open my $LOG, '>', 'error_log'
    or die "Cannot open 'error_log' for writing: $!\n";

# Record one failed row in the error log and tell the user on STDOUT.
# The first two tab-separated fields keep the original log layout
# (<id> TAB "Error in <url>"); the reason is appended as a third field.
my $report_failure = sub {
    my ($label, $what, $reason) = @_;
    print {$LOG} $label, "\t", 'Error in ', $what, "\t", $reason, "\n";
    print 'Error!! Please check error_log for more information.', "\n";
    return;
};

ROW:
while ( defined( my $row = <$INDEX> ) ) {
    # Capture the line number right away: $. follows the most recently read
    # handle, so it must not be consulted after other I/O has happened.
    my $line_no = $.;

    next ROW if $line_no < $start_id + 1;    # skip index header
    last ROW if $line_no > $end_id + 1;

    # Strip any line ending, including a stray CR from a CRLF index file
    # read on a non-Windows system.
    $row =~ s/[\r\n]+\z//;

    my @records = split /\t/, $row;
    my $path    = $records[3];               # get url
    my $id      = $records[5];               # get id

    # A short or malformed row would otherwise yield an undef URL and a
    # bogus "000000" file name.
    if ( !defined($path) || !defined($id) || $id !~ /\A\d+\z/ ) {
        $report_failure->(
            "line $line_no", "index line $line_no", 'malformed index row'
        );
        next ROW;
    }

    my $url    = "http://www.thesite.com/Archives/" . $path;
    my $out_fn = File::Spec->catfile( $dir, sprintf( "%06d", $id ) );

    print 'Downloading file ', $id, "\n";
    print 'url: ', $url, "\n";
    print 'save to: ', $out_fn, "\n";

    my $response = $ua->get($url);
    if ( !$response->is_success ) {
        $report_failure->( $id, $url, $response->status_line );
        next ROW;
    }

    # Only create the output file once there is something to store, so a
    # failed download never leaves an empty file behind.  ':raw' writes the
    # response bytes untouched (no CRLF translation on Windows).
    my $OUT;
    if ( !open $OUT, '>:raw', $out_fn ) {
        $report_failure->( $id, $url, "cannot open $out_fn: $!" );
        next ROW;
    }
    if ( !print {$OUT} $response->content ) {
        $report_failure->( $id, $url, "cannot write $out_fn: $!" );
        close $OUT;
        next ROW;
    }
    # Buffered write errors (e.g. disk full) only surface at close.
    if ( !close $OUT ) {
        $report_failure->( $id, $url, "cannot close $out_fn: $!" );
        next ROW;
    }

    print 'Done!', "\n";
}

close $INDEX;
close $LOG or die "Cannot close 'error_log': $!\n";

__END__