package PerlIO::via::StripHTML;
require 5.008;
use strict;
use warnings;
use HTML::Parser 3.00;
our $VERSION = 0.04;
# PUSHED($class, $mode)
#
# PerlIO::via callback invoked when this layer is pushed onto a handle.
# Builds the layer object: a hash holding
#   buffer - an lvalue closure giving read/write access to the text
#            accumulated by the parser handlers, and
#   parser - an HTML::Parser (API version 3) whose handlers append the
#            decoded text of the document to that buffer.
#
# Returns the blessed layer object, or -1 (failure) when the handle is
# opened in any mode other than plain read ('r').
sub PUSHED {
    my ($class, $mode) = @_;

    # This layer only filters input.
    return -1 if $mode ne 'r';

    # The following variables are updated / accessed via the closures below
    my $buffer = '';    # internal buffer for this layer
    my %inside = ();    # number of currently open elements, keyed by tag name

    my $parser = HTML::Parser->new(
        api_version     => 3,
        marked_sections => 1,
        start_h         => [
            sub {
                my ($tag) = @_;

                # Keep a minimum of layout: line breaks and table rows
                # start a new line, paragraphs are set off by a blank line.
                $buffer .= "\n"   if $tag =~ /\A[bt]r\z/;
                $buffer .= "\n\n" if $tag eq 'p';
                ++$inside{$tag};
                return;
            },
            'tagname',
        ],
        end_h => [
            sub {
                my ($tag) = @_;

                # Never let a counter drop below zero: a stray end tag
                # (e.g. a lone </script>) would otherwise leave a negative,
                # hence true, value and suppress all following text.
                --$inside{$tag} if $inside{$tag};
                return;
            },
            'tagname',
        ],
        text_h => [
            sub {
                my ($text) = @_;

                # Script and style content is not part of the visible text.
                $buffer .= $text unless $inside{script} || $inside{style};
                return;
            },
            'dtext',
        ],
    );

    return bless {
        buffer => sub : lvalue { $buffer },
        parser => $parser,
    }, $class;
}
# FILL($fh)
#
# PerlIO::via callback that supplies the next chunk of filtered data.
# Reads one line from the underlying handle $fh, feeds it to the HTML
# parser and returns whatever text the parser handlers appended to the
# layer buffer (possibly the empty string).
#
# The parser is fed incrementally and is told about the end of the
# document only once $fh is exhausted, so markup that spans several
# lines (a tag, a comment, a marked section) is still recognised as
# markup instead of being flushed as literal text at each line end.
#
# Returns undef when there is no more data, or when the parser reports
# a failure.
sub FILL {
    my ($self, $fh) = @_;
    my $parser = $self->{parser};

    # Start every call with an empty buffer; the handlers append to it.
    $self->{buffer}->() = '';

    my $line = <$fh>;
    if (defined $line) {
        $parser->parse($line) or return;
        return $self->{buffer}->();
    }

    # Underlying handle is at EOF: flush whatever the parser still holds.
    # On later calls nothing is left, so the buffer stays empty and undef
    # (end of file) is reported.
    $parser->eof;
    my $tail = $self->{buffer}->();
    return length($tail) ? $tail : undef;
}
1;
__END__
=head1 NAME
PerlIO::via::StripHTML - PerlIO layer to strip HTML tags from an input file
=head1 SYNOPSIS
use PerlIO::via::StripHTML;
open my $file, '<:via(StripHTML)', 'foo.html'
or die "Can't open foo.html: $!\n";
=head1 DESCRIPTION
This package implements a PerlIO layer, for reading files only. It
strips HTML tags from the input, leaving only plain text. This can be
useful, for example, to find something in the text of an HTML page.
=head1 BUGS
This is only a preliminary version.
=head1 SEE ALSO
PerlIO::via
=head1 AUTHOR
Copyright (c) 2002 Rafael Garcia-Suarez. All rights reserved. This
program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.
The HTML stripping code was borrowed from the F<eg/htext> script in the
C<HTML::Parser> distribution.
=cut