Fix regex for finding sections.
This commit is contained in:
15
ccc
15
ccc
@@ -168,19 +168,12 @@ sub extract_section_numbers {
|
|||||||
my ($html) = @_;
|
my ($html) = @_;
|
||||||
my @sections;
|
my @sections;
|
||||||
|
|
||||||
# Look for section numbers that appear after a <p> tag
|
# Find section header numbers with Windows lines
|
||||||
# Windows line endings (\r\n) are used in the HTML
|
while($html =~ /\r\n<[p|P] class=MsoNormal[^>]*>(<i[^>]*>)?(\d{1,4})/g) {
|
||||||
# Example: <p class=MsoNormal>199\r\n"I believe in God...
|
push @sections, $2;
|
||||||
while ($html =~ /<p[^>]*>(\d{1,4})[\r\n]+/g) {
|
|
||||||
my $num = $1;
|
|
||||||
# Only capture numbers in the valid CCC range (1-2865)
|
|
||||||
push @sections, $num if $num >= 1 && $num <= 3000;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Remove duplicates and return sorted
|
return @sections;
|
||||||
my %seen;
|
|
||||||
my @unique = grep { !$seen{$_}++ } @sections;
|
|
||||||
return sort {$a <=> $b} @unique;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sub fetch_and_display_section {
|
sub fetch_and_display_section {
|
||||||
|
|||||||
Reference in New Issue
Block a user