315 lines
7.2 KiB
Perl
315 lines
7.2 KiB
Perl
package HTML::FormatText;
|
|
|
|
# ABSTRACT: Format HTML as plaintext
|
|
|
|
|
|
use 5.006_001;
|
|
use strict;
|
|
use warnings;
|
|
|
|
# We now use Smart::Comments in place of the old DEBUG framework.
|
|
# this should be commented out in release versions....
|
|
##use Smart::Comments;
|
|
|
|
use base 'HTML::Formatter';
|
|
|
|
our $VERSION = '2.12'; # VERSION
|
|
our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub default_values {
|
|
( shift->SUPER::default_values(),
|
|
lm => 3, # left margin
|
|
rm => 72, # right margin (actually, maximum text width)
|
|
);
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub configure {
|
|
my ( $self, $hash ) = @_;
|
|
|
|
my $lm = $self->{lm};
|
|
my $rm = $self->{rm};
|
|
|
|
$lm = delete $hash->{lm} if exists $hash->{lm};
|
|
$lm = delete $hash->{leftmargin} if exists $hash->{leftmargin};
|
|
$rm = delete $hash->{rm} if exists $hash->{rm};
|
|
$rm = delete $hash->{rightmargin} if exists $hash->{rightmargin};
|
|
|
|
my $width = $rm - $lm;
|
|
if ( $width < 1 ) {
|
|
warn "Bad margins, ignored" if $^W;
|
|
return;
|
|
}
|
|
if ( $width < 20 ) {
|
|
warn "Page probably too narrow" if $^W;
|
|
}
|
|
|
|
for ( keys %$hash ) {
|
|
warn "Unknown configure option '$_'" if $^W;
|
|
}
|
|
|
|
$self->{lm} = $lm;
|
|
$self->{rm} = $rm;
|
|
$self;
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub begin {
|
|
my $self = shift;
|
|
|
|
$self->SUPER::begin;
|
|
$self->{curpos} = 0; # current output position.
|
|
$self->{maxpos} = 0; # highest value of $pos (used by header underliner)
|
|
$self->{hspace} = 0; # horizontal space pending flag
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub end {
|
|
shift->collect("\n");
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub header_start {
|
|
my ( $self, $level ) = @_;
|
|
|
|
$self->vspace( 1 + ( 6 - $level ) * 0.4 );
|
|
$self->{maxpos} = 0;
|
|
1;
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub header_end {
|
|
my ( $self, $level ) = @_;
|
|
|
|
if ( $level <= 2 ) {
|
|
my $line;
|
|
$line = '=' if $level == 1;
|
|
$line = '-' if $level == 2;
|
|
$self->vspace(0);
|
|
$self->out( $line x ( $self->{maxpos} - $self->{lm} ) );
|
|
}
|
|
$self->vspace(1);
|
|
1;
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub bullet {
|
|
my $self = shift;
|
|
|
|
$self->SUPER::bullet( $_[0] . ' ' );
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub hr_start {
|
|
my $self = shift;
|
|
|
|
$self->vspace(1);
|
|
$self->out( '-' x ( $self->{rm} - $self->{lm} ) );
|
|
$self->vspace(1);
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub pre_out {
|
|
my $self = shift;
|
|
|
|
# should really handle bold/italic etc.
|
|
if ( defined $self->{vspace} ) {
|
|
if ( $self->{out} ) {
|
|
$self->nl() while $self->{vspace}-- >= 0;
|
|
$self->{vspace} = undef;
|
|
}
|
|
}
|
|
my $indent = ' ' x $self->{lm};
|
|
my $pre = shift;
|
|
$pre =~ s/^/$indent/mg;
|
|
$self->collect($pre);
|
|
$self->{out}++;
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub out {
|
|
my $self = shift;
|
|
my $text = shift;
|
|
|
|
$text =~ tr/\xA0\xAD/ /d;
|
|
|
|
if ( $text =~ /^\s*$/ ) {
|
|
$self->{hspace} = 1;
|
|
return;
|
|
}
|
|
|
|
if ( defined $self->{vspace} ) {
|
|
if ( $self->{out} ) {
|
|
$self->nl while $self->{vspace}-- >= 0;
|
|
}
|
|
$self->goto_lm;
|
|
$self->{vspace} = undef;
|
|
$self->{hspace} = 0;
|
|
}
|
|
|
|
if ( $self->{hspace} ) {
|
|
if ( $self->{curpos} + length($text) > $self->{rm} ) {
|
|
|
|
# word will not fit on line; do a line break
|
|
$self->nl;
|
|
$self->goto_lm;
|
|
}
|
|
else {
|
|
|
|
# word fits on line; use a space
|
|
$self->collect(' ');
|
|
++$self->{curpos};
|
|
}
|
|
$self->{hspace} = 0;
|
|
}
|
|
|
|
$self->collect($text);
|
|
my $pos = $self->{curpos} += length $text;
|
|
$self->{maxpos} = $pos if $self->{maxpos} < $pos;
|
|
$self->{'out'}++;
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub goto_lm {
|
|
my $self = shift;
|
|
|
|
my $pos = $self->{curpos};
|
|
my $lm = $self->{lm};
|
|
if ( $pos < $lm ) {
|
|
$self->{curpos} = $lm;
|
|
$self->collect( " " x ( $lm - $pos ) );
|
|
}
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub nl {
|
|
my $self = shift;
|
|
|
|
$self->{'out'}++;
|
|
$self->{curpos} = 0;
|
|
$self->collect("\n");
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub adjust_lm {
|
|
my $self = shift;
|
|
|
|
$self->{lm} += $_[0];
|
|
$self->goto_lm;
|
|
}
|
|
|
|
# ------------------------------------------------------------------------
|
|
sub adjust_rm {
|
|
shift->{rm} += $_[0];
|
|
}
|
|
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=pod
|
|
|
|
=for test_synopsis 1;
|
|
__END__
|
|
|
|
=for stopwords latin1 leftmargin lm plaintext rightmargin rm CPAN homepage
|
|
|
|
=head1 NAME
|
|
|
|
HTML::FormatText - Format HTML as plaintext
|
|
|
|
=head1 VERSION
|
|
|
|
version 2.12
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use HTML::TreeBuilder;
|
|
$tree = HTML::TreeBuilder->new->parse_file("test.html");
|
|
|
|
use HTML::FormatText;
|
|
$formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 50);
|
|
print $formatter->format($tree);
|
|
|
|
or, more simply:
|
|
|
|
use HTML::FormatText;
|
|
my $string = HTML::FormatText->format_file(
|
|
'test.html',
|
|
leftmargin => 0, rightmargin => 50
|
|
);
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
HTML::FormatText is a formatter that outputs plain latin1 text. All character
|
|
attributes (bold/italic/underline) are ignored. Formatting of HTML tables and
|
|
forms is not implemented.
|
|
|
|
HTML::FormatText is built on L<HTML::Formatter> and documentation for that
|
|
module applies to this - especially L<HTML::Formatter/new>,
|
|
L<HTML::Formatter/format_file> and L<HTML::Formatter/format_string>.
|
|
|
|
You might specify the following parameters when constructing the formatter:
|
|
|
|
=over 4
|
|
|
|
=item I<leftmargin> (alias I<lm>)
|
|
|
|
The column of the left margin. The default is 3.
|
|
|
|
=item I<rightmargin> (alias I<rm>)
|
|
|
|
The column of the right margin. The default is 72.
|
|
|
|
=back
|
|
|
|
=head1 SEE ALSO
|
|
|
|
L<HTML::Formatter>
|
|
|
|
=head1 INSTALLATION
|
|
|
|
See perlmodinstall for information and options on installing Perl modules.
|
|
|
|
=head1 BUGS AND LIMITATIONS
|
|
|
|
You can make new bug reports, and view existing ones, through the
|
|
web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Format>.
|
|
|
|
=head1 AVAILABILITY
|
|
|
|
The project homepage is L<https://metacpan.org/release/HTML-Format>.
|
|
|
|
The latest version of this module is available from the Comprehensive Perl
|
|
Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
|
|
site near you, or see L<https://metacpan.org/module/HTML::Format/>.
|
|
|
|
=head1 AUTHORS
|
|
|
|
=over 4
|
|
|
|
=item *
|
|
|
|
Nigel Metheringham <nigelm@cpan.org>
|
|
|
|
=item *
|
|
|
|
Sean M Burke <sburke@cpan.org>
|
|
|
|
=item *
|
|
|
|
Gisle Aas <gisle@ActiveState.com>
|
|
|
|
=back
|
|
|
|
=head1 COPYRIGHT AND LICENSE
|
|
|
|
This software is copyright (c) 2015 by Nigel Metheringham, 2002-2005 Sean M Burke, 1999-2002 Gisle Aas.
|
|
|
|
This is free software; you can redistribute it and/or modify it under
|
|
the same terms as the Perl 5 programming language system itself.
|
|
|
|
=cut
|