package MIME::Lite::HTML;
# module MIME::Lite::HTML : Provide routine to transform a HTML page in
# a MIME::Lite mail
# Copyright 2001/2011 A.Barbet alian@cpan.org. All rights reserved.
# $Log: HTML.pm,v $
# Revision 1.24 2011/10/07 11:27:42 alian
#
# Revision 1.24 2011/10/07 11:27:42 alian
# - Fix rt#67695 Add feature: "ExternImages" parameter to constructor (tbriggs)
# - Fix rt#68303 Outdated COPYING file
# - Fix rt#52907 CSS (and likely other) links match double-quote only
# - Fix rt#41447 Unable to call replace function
# - Fix rt#40164 Removing script code often fails
# - Fix bug when HTTP result is gzip format (use decoded_content, tks to E.Bataille
#
# Revision 1.23 2008/10/14 11:27:42 alian
# - Fix rt#36006: cid has no effect on background images
# - Fix rt#36005: include_javascript does not remove closing tag ""
# - Fix rt#29033: eliminate nested subs
# Revision 1.22 2006/09/06 14:46:42 alian
# - Fix rt#19656: unknown URI schemes cause rewrite to fail
# - Fix rt#17385: make test semi-panics
# - Fix rt#7841: Text-Only Encoding Ignored
# - Fix rt#21339: no license or copyright information provided
# - Fix rt#19655: include_css is far too aggressive
#
# Revision 1.21 2004/04/15 22:59:33 alian
# fix for 1.20 and bad ref for tests
#
# Revision 1.20 2004/04/14 21:26:51 alian
# - fix error on last version
#
# Revision 1.19 2004/03/16 15:18:57 alian
# - Add Url param in new for direct call of parse & send
# - Correct a problem in parsing of html elem background
# - Re-indent some methods
#
# Revision 1.18 2003/08/08 09:37:42 alian
# Fix test case and cid method
#
# Revision 1.17 2003/08/07 16:55:08 alian
# - Fix test case (hostname)
# - Update POD documentation
#
# Revision 1.16 2003/08/07 00:07:57 alian
# - Use pack for include type == cid: RFC says no '/'.
# Tks to Cláudio Valente for report.
# - Add a __END__ statement before POD documentation.
#
# Revision 1.15 2002/10/19 17:54:32 alian
# - Correct bug with relative anchor '/'. Tks to Keith D. Zimmerman for
# report.
#
# See Changes files for older changes
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use MIME::Lite;
use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw();
$VERSION = ('$Revision: 1.24 $ ' =~ /(\d+\.\d+)/)[0];
my $LOGINDETAILS;
#------------------------------------------------------------------------------
# redefine get_basic_credentials
#------------------------------------------------------------------------------
{
package RequestAgent;
use vars qw(@ISA);
@ISA = qw(LWP::UserAgent);
sub new {
my $self = LWP::UserAgent::new(@_);
$self;
}
sub get_basic_credentials {
my($self, $realm, $uri) = @_;
# Use parameter of MIME-Lite-HTML, key LoginDetails
if (defined $LOGINDETAILS) { return split(':', $LOGINDETAILS, 2); }
# Ask user on STDIN
elsif (-t) {
my $netloc = $uri->host_port;
print "Enter username for $realm at $netloc: ";
my $user = ;
chomp($user);
# 403 if no user given
return (undef, undef) unless length $user;
print "Password: ";
system("stty -echo");
my $password = ;
system("stty echo");
print "\n"; # because we disabled echo
chomp($password);
return ($user, $password);
}
# Damm we got 403 with CGI (use param LoginDetails) ...
else { return (undef, undef) }
}
}
#------------------------------------------------------------------------------
# new
#------------------------------------------------------------------------------
sub new {
my $class = shift;
my $self = {};
bless $self, $class;
my %param = @_;
# Agent name
$self->{_AGENT} = new RequestAgent;
$self->{_AGENT}->agent("MIME-Lite-HTML $VERSION");
$self->{_AGENT}->from('mime-lite-html@alianwebserver.com' );
# remove javascript code or no ?
if ($param{'remove_jscript'}) {
$self->{_remove_jscript} = 1;
} else { $self->{_remove_jscript} = 0; }
# Set debug level
if ($param{'Debug'}) {
$self->{_DEBUG} = 1;
delete $param{'Debug'};
}
# Set Login information
if ($param{'LoginDetails'}) {
$LOGINDETAILS = $param{'LoginDetails'};
delete $param{'LoginDetails'};
}
# Set type of include to do
if ($param{'IncludeType'}) {
die "IncludeType must be in 'extern', 'cid' or 'location'\n" if
( ($param{'IncludeType'} ne 'extern') and
($param{'IncludeType'} ne 'cid') and
($param{'IncludeType'} ne 'location'));
$self->{_include} = $param{'IncludeType'};
delete $param{'IncludeType'};
} # Defaut type: use a Content-Location field
else {$self->{_include}='location';}
# Get regexps for images that should be external
if (defined $param{'ExternImages'}) {
$self->{_externimages} = $param{'ExternImages'};
}
## Added by Michalis@linuxmail.org to manipulate non-us mails
if ($param{'TextCharset'}) {
$self->{_textcharset}=$param{'TextCharset'};
delete $param{'TextCharset'};
} else { $self->{_textcharset}='iso-8859-1'; }
if ($param{'HTMLCharset'}) {
$self->{_htmlcharset}=$param{'HTMLCharset'};
delete $param{'HTMLCharset'};
} else { $self->{_htmlcharset}='iso-8859-1'; }
if ($param{'TextEncoding'}) {
$self->{_textencoding}=$param{'TextEncoding'};
delete $param{'TextEncoding'};
} else { $self->{_textencoding}='7bit'; }
if ($param{'HTMLEncoding'}) {
$self->{_htmlencoding}=$param{'HTMLEncoding'};
delete $param{'HTMLEncoding'};
} else { $self->{_htmlencoding}='quoted-printable'; }
## End. Default values remain as they were initially set.
## No need to change existing scripts if you send US-ASCII.
## If you DON't send us-ascii, you wouldn't be able to use
## MIME::Lite::HTML anyway :-)
# Set proxy to use to get file
if ($param{'Proxy'}) {
$self->{_AGENT}->proxy('http',$param{'Proxy'}) ;
print "Set proxy for http : ", $param{'Proxy'},"\n"
if ($self->{_DEBUG});
delete $param{'Proxy'};
}
# Set hash to use with template
if ($param{'HashTemplate'}) {
$param{'HashTemplate'} = ref($param{'HashTemplate'}) eq "HASH"
? $param{'HashTemplate'} : %{$param{'HashTemplate'}};
$self->{_HASH_TEMPLATE}= $param{'HashTemplate'};
delete $param{'HashTemplate'};
}
# Ok I hope I known what I do ;-)
MIME::Lite->quiet(1);
# direct call of new parse & send
my $url;
if ($param{'Url'}) {
$url = $param{'Url'};
delete $param{'Url'};
}
$self->{_param} = \%param;
if ($url) {
my $m = $self->parse($url);
$m->send;
}
return $self;
}
#------------------------------------------------------------------------------
# absUrl
#------------------------------------------------------------------------------
sub absUrl($$) {
# rt 19656 : unknown URI schemes cause rewrite to fail
my $rep = eval { URI::WithBase->new($_[0], $_[1])->abs; };
return ($rep ? $rep : $_[0]);
}
# Replace in HTML link with image with cid:key
sub pattern_image_cid {
my $sel = shift;
return 'cid(absUrl($_[1],$_[2])).'"';
}
# Replace relative url for image with absolute
sub pattern_image {
return '{_DEBUG};
my $req = new HTTP::Request('GET' => $url_page);
my $res = $self->{_AGENT}->request($req);
if (!$res->is_success)
{$self->set_err("Can't fetch $url_page (".$res->message.")");}
else {$gabarit = $res->content;}
$racinePage=$url1 || $res->base;
}
else {$gabarit=$url_page;$racinePage=$url1;}
# Get content of $url_txt with LWP if needed
if ($url_txt)
{
if ($url_txt=~/^(https?|ftp|file|nntp):\/\//)
{
print "Get ", $url_txt,"\n" if $self->{_DEBUG};
my $req2 = new HTTP::Request('GET' => $url_txt);
my $res3 = $self->{_AGENT}->request($req2);
if (!$res3->is_success)
{$self->set_err("Can't fetch $url_txt (".$res3->message.")");}
else {$gabarit_txt = $res3->content;}
}
else {$gabarit_txt=$url_txt;}
}
goto BUILD_MESSAGE unless $gabarit;
# Get all multimedia part (img, flash) for later create a MIME part
# for each of them
my $analyseur = HTML::LinkExtor->new;
$analyseur->parse($gabarit);
my @l = $analyseur->links;
# Include external CSS files
$gabarit = $self->include_css($gabarit,$racinePage);
# Include external Javascript files
$gabarit = $self->include_javascript($gabarit,$racinePage);
# Include form images
($gabarit,@mail) = $self->input_image($gabarit,$racinePage);
# Change target action for form
$gabarit = $self->link_form($gabarit,$racinePage);
# Scan each part found by linkExtor
my (%images_read,%url_remplace);
foreach my $url (@l) {
my $urlAbs = absUrl($$url[2],$racinePage);
chomp $urlAbs; # Sometime a strange cr/lf occur
# Replace relative href found to absolute one
if ( ($$url[0] eq 'a') && ($$url[1] eq 'href') && ($$url[2]) &&
(($$url[2]!~m!^http://!) && # un lien non absolu
($$url[2]!~m!^mailto:!) && # pas les mailto
($$url[2]!~m!^\#!)) && # ni les ancres
(!$url_remplace{$urlAbs}) ) # ni les urls deja remplacees
{
$gabarit=~s/\s href \s* = \s* [\"']? \Q$$url[2]\E ([\"'>])
/pattern_href($urlAbs,"href",$1)/giemx;
print "Replace ",$$url[2]," with ",$urlAbs,"\n"
if ($self->{_DEBUG});
$url_remplace{$urlAbs}=1;
}
# For frame & iframe
elsif ( (lc($$url[0] eq 'iframe') || lc($$url[0] eq 'frame')) &&
(lc($$url[1]) eq 'src') && ($$url[2]) )
{
$gabarit=~s/\s src \s* = \s* [\"']? \Q$$url[2]\E ([\"'>])
/pattern_href($urlAbs,"src",$1)/giemx;
print "Replace ",$$url[2]," with ",$urlAbs,"\n"
if ($self->{_DEBUG});
$url_remplace{$urlAbs}=1;
}
# For background images
elsif ((lc($$url[1]) eq 'background') && ($$url[2])) {
# Replace relative url with absolute
my $v = ($self->{_include} eq 'cid') ?
"cid:".$self->cid($urlAbs) : $urlAbs;
$gabarit=~s/background \s* = \s* [\"']? \Q$$url[2]\E ([\"'>])
/pattern_href($v,"background",$1)/giemx;
# Exit with extern configuration, don't include image
# else add part to mail
if (($self->{_include} ne 'extern')&&(!$images_read{$urlAbs})
and not $self->_matches_extern_images( $urlAbs ) )
{
$images_read{$urlAbs} = 1;
push(@mail, $self->create_image_part($urlAbs));
}
}
# For flash part (embed)
elsif (lc($$url[0]) eq 'embed' && $$url[4])
{
# rebuild $urlAbs
$urlAbs = absUrl($$url[4],$racinePage);
# Replace relative url with absolute
my $v = ($self->{_include} eq 'cid') ?
"cid:$urlAbs" : $urlAbs;
$gabarit=~s/src \s = \s [\"'] \Q$$url[4]\E ([\"'>])
/pattern_href($v,"src",$1)/giemx;
# Exit with extern configuration, don't include image
if (($self->{_include} ne 'extern')&&(!$images_read{$urlAbs})
and not $self->_matches_extern_images( $urlAbs ) )
{
$images_read{$urlAbs}=1;
push(@mail, $self->create_image_part($urlAbs));
}
}
# For flash part (object)
# Need to add "param" to Tagset.pm in the linkElements definition:
# 'param' => ['name', 'value'],
# Tks to tosh@c4.ca for that
elsif (lc($$url[0]) eq 'param' && lc($$url[2]) eq 'movie'
&& $$url[4]) {
# rebuild $urlAbs
$urlAbs = absUrl($$url[4],$racinePage);
# Replace relative url with absolute
my $v = ($self->{_include} eq 'cid') ?
"cid:".$self->cid($urlAbs) : $urlAbs;
$gabarit=~s/value \s* = \s* [\"'] \Q$$url[4]\E ([\"'>])
/pattern_href($v,"value",$1)/giemx;
# Exit with extern configuration, don't include image
if (($self->{_include} ne 'extern')&&(!$images_read{$urlAbs})
and not $self->_matches_extern_images($urlAbs))
{
$images_read{$urlAbs}=1;
push(@mail, $self->create_image_part($urlAbs));
}
}
# For new images create part
# Exit with extern configuration, don't include image
elsif ( ($self->{_include} ne 'extern') &&
( not $self->_matches_extern_images( $urlAbs ) ) &&
((lc($$url[0]) eq 'img') || (lc($$url[0]) eq 'src')) &&
(!$images_read{$urlAbs})) {
$images_read{$urlAbs}=1;
push(@mail, $self->create_image_part($urlAbs));
}
}
# If cid choice, put a cid + absolute url on each link image
if ($self->{_include} eq 'cid')
{$gabarit=~s/]*) src\s*=\s*(["']?) ([^"'> ]* )(["']?)
/pattern_image_cid($self,$1,$3,$racinePage)/iegx;}
# Else just make a absolute url
else {$gabarit=~s/]*) src\s*=\s*(["']?)([^"'> ]*) (["']?)
/pattern_image($1,$3,$racinePage)/iegx;}
BUILD_MESSAGE:
# Substitue value in template if needed
if (scalar keys %{$self->{_HASH_TEMPLATE}}!=0)
{
$gabarit=$self->fill_template($gabarit,$self->{_HASH_TEMPLATE})
if ($gabarit);
$gabarit_txt=$self->fill_template($gabarit_txt,
$self->{_HASH_TEMPLATE});
}
# Create MIME-Lite object
$self->build_mime_object($gabarit, $gabarit_txt || undef, \@mail);
return $self->{_MAIL};
}
#------------------------------------------------------------------------------
# size
#------------------------------------------------------------------------------
sub size {
my ($self)=shift;
return length($self->{_MAIL}->as_string);
}
#------------------------------------------------------------------------------
# _matches_extern_images
#
# For a given image, does it match any of the regexps in $self->{_externimages} ?
#------------------------------------------------------------------------------
sub _matches_extern_images {
my ( $self, $image ) = @_;
my $regexps = $self->{_externimages} || [ ];
foreach my $regexp ( @$regexps ) {
if ( $image =~ /$regexp/ ) {
return 1;
}
}
return 0;
}
#------------------------------------------------------------------------------
# build_mime_object
#------------------------------------------------------------------------------
sub build_mime_object {
my ($self,$html,$txt,$ref_mail)=@_;
my ($txt_part, $part,$mail);
# Create part for HTML if needed
if ($html) {
my $ref = ($txt || @$ref_mail) ? {} : $self->{_param};
$part = new MIME::Lite(%$ref,
'Type' => 'TEXT',
'Encoding' => $self->{_htmlencoding},
'Data' => $html);
$part->attr("content-type"=> "text/html; charset=".$self->{_htmlcharset});
# Remove some header for Eudora client in HTML and related part
# $part->replace("MIME-Version" => ""); #MFF-380 - dodrzeni standardu RFC-2045
$part->replace('X-Mailer' =>"");
$part->replace('Content-Disposition' =>"");
# only html, no images & no txt
$mail = $part unless ($txt || @$ref_mail);
}
# Create part for text if needed
if ($txt) {
my $ref = ($html ? {} : $self->{_param} );
$txt_part = new MIME::Lite (%$ref,
'Type' => 'TEXT',
'Data' => $txt,
'Encoding' => $self->{_textencoding});
$txt_part->attr("content-type" =>
"text/plain; charset=".$self->{_textcharset});
# Remove some header for Eudora client
# $txt_part->replace("MIME-Version" => ""); #MFF-380 - dodrzeni standardu RFC-2045
$txt_part->replace("X-Mailer" => "");
$txt_part->replace("Content-Disposition" => "");
# only text, no html
$mail = $txt_part unless $html;
}
# If images and html and no text, multipart/related
if (@$ref_mail and !$txt) {
my $ref=$self->{_param};
$$ref{'Type'} = "multipart/related";
$mail = new MIME::Lite (%$ref);
# Attach HTML part to related part
$mail->attach($part);
# Attach each image to related part
foreach (@$ref_mail) {$mail->attach($_);} # Attach list of part
$mail->replace("Content-Disposition" => "");
}
# Else if html and text and no images, multipart/alternative
elsif ($txt and !@$ref_mail) {
my $ref=$self->{_param};
$$ref{'Type'} = "multipart/alternative";
$mail = new MIME::Lite (%$ref);
$mail->attach($txt_part); # Attach text part
$mail->attach($part); # Attach HTML part
}
# Else (html, txt and images) mutilpart/alternative
elsif ($txt && @$ref_mail) {
my $ref=$self->{_param};
$$ref{'Type'} = "multipart/alternative";
$mail = new MIME::Lite (%$ref);
# Create related part
my $rel = new MIME::Lite ('Type'=>'multipart/related');
$rel->replace("Content-transfer-encoding" => "");
$rel->replace("MIME-Version" => "");
$rel->replace("X-Mailer" => "");
# Attach text part to alternative part
$mail->attach($txt_part);
# Attach HTML part to related part
$rel->attach($part);
# Attach each image to related part
foreach (@$ref_mail) {$rel->attach($_);}
# Attach related part to alternative part
$mail->attach($rel);
}
$mail->replace('X-Mailer' => "MIME::Lite::HTML $VERSION");
$self->{_MAIL} = $mail;
}
#------------------------------------------------------------------------------
# include_css
#------------------------------------------------------------------------------
sub pattern_css {
my ($self,$url,$milieu,$fin,$root)=@_;
# if not stylesheet - rt19655
if ($milieu!~/stylesheet/i && $fin!~/stylesheet/i) {
return "";
}
# Don't store tag. Tks to doggy@miniasp.com
if ( $fin =~ m/shortcut/i || $milieu =~ m/shortcut/i )
{ return ""; }
# Complete url
my $ur = URI::URL->new($url, $root)->abs;
print "Include CSS file $ur\n" if $self->{_DEBUG};
my $res2 = $self->{_AGENT}->request(new HTTP::Request('GET' => $ur));
print "Ok file downloaded\n" if $self->{_DEBUG};
return '\n";
}
sub include_css(\%$$) {
my ($self,$gabarit,$root)=@_;
$gabarit=~s/]*?)
href\s*=\s*["']?([^\"\' ]*)["']?([^>]*)>
/$self->pattern_css($2,$1,$3,$root)/iegmx;
print "Done CSS\n" if ($self->{_DEBUG});
return $gabarit;
}
#------------------------------------------------------------------------------
# include_javascript
#------------------------------------------------------------------------------
sub pattern_js {
my ($self,$url,$milieu,$fin,$root)=@_;
my $ur = URI::URL->new($url, $root)->abs;
print "Include Javascript file $ur\n" if $self->{_DEBUG};
my $res2 = $self->{_AGENT}->request(new HTTP::Request('GET' => $ur));
my $content = $res2->decoded_content;
print "Ok file downloaded\n" if $self->{_DEBUG};
return ($self->{_remove_jscript} ? ' ' : "\n"."\n".
'\n");
}
sub include_javascript(\%$$) {
my ($self,$gabarit,$root)=@_;
$gabarit=~s/