Fix regex for finding sections.

This commit is contained in:
2026-04-01 11:04:21 +02:00
parent 184b418d5b
commit ede7007fbd

15
ccc
View File

@@ -168,19 +168,12 @@ sub extract_section_numbers {
 my ($html) = @_;
 my @sections;
-# Look for section numbers that appear after a <p> tag
-# Windows line endings (\r\n) are used in the HTML
-# Example: <p class=MsoNormal>199\r\n&quot;I believe in God...
-while ($html =~ /<p[^>]*>(\d{1,4})[\r\n]+/g) {
-my $num = $1;
-# Only capture numbers in the valid CCC range (1-2865)
-push @sections, $num if $num >= 1 && $num <= 3000;
+# Find section header numbers with Windows lines
+while($html =~ /\r\n<[p|P] class=MsoNormal[^>]*>(<i[^>]*>)?(\d{1,4})/g) {
+push @sections, $2;
 }
-# Remove duplicates and return sorted
-my %seen;
-my @unique = grep { !$seen{$_}++ } @sections;
-return sort {$a <=> $b} @unique;
+return @sections;
 }
 sub fetch_and_display_section {