Fix HTML file download.
This commit is contained in:
@@ -19,6 +19,7 @@ distributions):
|
|||||||
- `File::Spec` - For file path operations
|
- `File::Spec` - For file path operations
|
||||||
- `File::HomeDir` - For home directory detection
|
- `File::HomeDir` - For home directory detection
|
||||||
- `File::Path` - For creating cache directory structures
|
- `File::Path` - For creating cache directory structures
|
||||||
|
- `Math::Base36` - For calculating the name of the HTML files
|
||||||
|
|
||||||
## Licensing
|
## Licensing
|
||||||
|
|
||||||
|
|||||||
31
ccc
31
ccc
@@ -29,6 +29,7 @@ use JSON::PP;
|
|||||||
use File::Spec;
|
use File::Spec;
|
||||||
use File::HomeDir;
|
use File::HomeDir;
|
||||||
use File::Path qw(make_path);
|
use File::Path qw(make_path);
|
||||||
|
use Math::Base36 ':all';
|
||||||
|
|
||||||
my $arg = shift or usage();
|
my $arg = shift or usage();
|
||||||
|
|
||||||
@@ -118,40 +119,42 @@ sub build_section_map {
|
|||||||
|
|
||||||
print STDERR "Fetching Catechism sections...\n";
|
print STDERR "Fetching Catechism sections...\n";
|
||||||
|
|
||||||
# Build list of all possible HTML files based on hex naming
|
# Build list of all possible HTML files from base36 naming
|
||||||
# Files are named __P1.HTM, __P2.HTM... __P9.HTM, __PA.HTM... __PF.HTM, __P10.HTM, etc.
|
|
||||||
my @filenames;
|
my @filenames;
|
||||||
for my $i (1..1000) {
|
my $total_files = decode_base36('AE');
|
||||||
my $hex = sprintf("%X", $i);
|
for my $i (1..$total_files) {
|
||||||
push @filenames, "__P$hex.HTM";
|
my $b36 = encode_base36($i);
|
||||||
|
push @filenames, "__P$b36.HTM";
|
||||||
}
|
}
|
||||||
|
|
||||||
my $count = 0;
|
my $count = 0;
|
||||||
my $consecutive_404s = 0;
|
my $consecutive_404s = 0;
|
||||||
foreach my $filename (@filenames) {
|
foreach my $filename (@filenames) {
|
||||||
my $url = "$base_url/$filename";
|
my $url = "$base_url/$filename";
|
||||||
|
retry_get:
|
||||||
my $response = $ua->get($url);
|
my $response = $ua->get($url);
|
||||||
|
|
||||||
unless($response->is_success) {
|
unless($response->is_success) {
|
||||||
# Count consecutive 404s, stop after too many
|
|
||||||
$consecutive_404s++;
|
$consecutive_404s++;
|
||||||
next if $consecutive_404s < 10; # Allow up to 9 consecutive 404s
|
goto retry_get if $consecutive_404s < 3;
|
||||||
last;
|
|
||||||
|
print STDERR "\n404 response on '$url'.\n";
|
||||||
|
exit 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
$consecutive_404s = 0; # Reset counter on success
|
$consecutive_404s = 0;
|
||||||
|
|
||||||
my $content = $response->content;
|
my $content = $response->content;
|
||||||
|
|
||||||
# Extract all section numbers
|
|
||||||
my @sections = extract_section_numbers($content);
|
my @sections = extract_section_numbers($content);
|
||||||
|
|
||||||
foreach my $section (@sections) {
|
foreach my $section (@sections) {
|
||||||
$section_map{$section} = $filename;
|
$section_map{$section} = $filename;
|
||||||
$count++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$count++;
|
||||||
|
|
||||||
# Print progress
|
# Print progress
|
||||||
print STDERR "." if(scalar(keys %section_map) % 50 == 0);
|
my $progress = $count * 100 / $total_files;
|
||||||
|
print STDERR "\r$progress%";
|
||||||
}
|
}
|
||||||
|
|
||||||
print STDERR "\n";
|
print STDERR "\n";
|
||||||
|
|||||||
Reference in New Issue
Block a user