Initial commit.
This commit is contained in:
564
mail/spamassassin/DecodeShortURLs.pm
Normal file
564
mail/spamassassin/DecodeShortURLs.pm
Normal file
@@ -0,0 +1,564 @@
|
||||
# <@LICENSE>
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to you under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at:
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# </@LICENSE>
|
||||
|
||||
# Author: Steve Freegard <steve.freegard@fsl.com>
|
||||
|
||||
=head1 NAME
|
||||
|
||||
DecodeShortURLs - Expand shortened URLs
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
loadplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
|
||||
|
||||
url_shortener bit.ly
|
||||
url_shortener go.to
|
||||
...
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This plugin looks for URLs shortened by a list of URL shortening services and
|
||||
upon finding a matching URL will connect using to the shortening service and
|
||||
do an HTTP HEAD lookup and retrieve the location header which points to the
|
||||
actual shortened URL, it then adds this URL to the list of URIs extracted by
|
||||
SpamAssassin which can then be accessed by other plug-ins, such as URIDNSBL.
|
||||
|
||||
This plugin also sets the rule HAS_SHORT_URL if any matching short URLs are
|
||||
found.
|
||||
|
||||
Regular 'uri' rules can be used to detect and score links disabled by the
|
||||
shortening service for abuse and URL_BITLY_BLOCKED is supplied as an example.
|
||||
It should be safe to score this rule highly on a match as experience shows
|
||||
that bit.ly only blocks access to a URL if it has seen consistent abuse and
|
||||
problem reports.
|
||||
|
||||
As of version 0.3 this plug-in will follow 'chained' shorteners e.g.
|
||||
|
||||
|
||||
short URL -> short URL -> short URL -> real URL
|
||||
|
||||
|
||||
If this form of chaining is found, then the rule 'SHORT_URL_CHAINED' will be
|
||||
fired. If a loop is detected then 'SHORT_URL_LOOP' will be fired.
|
||||
This plug-in limits the number of chained shorteners to a maximim of 10 at
|
||||
which point it will fire the rule 'SHORT_URL_MAXCHAIN' and go no further.
|
||||
|
||||
If a shortener returns a '404 Not Found' result for the short URL then the
|
||||
rule 'SHORT_URL_404' will be fired.
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
This plugin runs the parsed_metadata hook with a priority of -1 so that
|
||||
it may modify the parsed URI list prior to the URIDNSBL plugin which
|
||||
runs as priority 0.
|
||||
|
||||
Currently the plugin queries a maximum of 10 distinct shortened URLs with
|
||||
a maximum timeout of 5 seconds per lookup. It does not recurse and follow
|
||||
'chained' shortening as the author has no examples of this happening.
|
||||
|
||||
=head1 ACKNOWLEDGEMENTS
|
||||
|
||||
A lot of this plugin has been hacked together by using other plugins as
|
||||
examples. The author would particularly like to tip his hat to Karsten
|
||||
Bräckelmann for the _add_uri_detail_list() function that he stole from
|
||||
GUDO.pm for which this plugin would not be possible due to the SpamAssassin
|
||||
API making no provision for adding to the base list of extracted URIs and
|
||||
the author not knowing enough about Perl to be able to achieve this without
|
||||
a good example from someone that does ;-)
|
||||
|
||||
=cut
|
||||
|
||||
package Mail::SpamAssassin::Plugin::DecodeShortURLs;
|
||||
|
||||
my $VERSION = 0.6;
|
||||
|
||||
use Mail::SpamAssassin::Plugin;
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use vars qw(@ISA);
|
||||
@ISA = qw(Mail::SpamAssassin::Plugin);
|
||||
|
||||
use constant HAS_LWP_USERAGENT => eval { local $SIG{'__DIE__'}; require LWP::UserAgent; };
|
||||
use constant HAS_SQLITE => eval { local $SIG{'__DIE__'}; require DBD::SQLite; };
|
||||
use Fcntl qw(:flock SEEK_END);
|
||||
use Sys::Syslog qw(:DEFAULT setlogsock);
|
||||
|
||||
|
||||
sub dbg {
|
||||
my $msg = shift;
|
||||
return Mail::SpamAssassin::Logger::dbg("DecodeShortURLs: $msg");
|
||||
}
|
||||
|
||||
sub new {
|
||||
my $class = shift;
|
||||
my $mailsaobject = shift;
|
||||
|
||||
$class = ref($class) || $class;
|
||||
my $self = $class->SUPER::new($mailsaobject);
|
||||
bless ($self, $class);
|
||||
|
||||
if ($mailsaobject->{local_tests_only} || !HAS_LWP_USERAGENT) {
|
||||
$self->{disabled} = 1;
|
||||
} else {
|
||||
$self->{disabled} = 0;
|
||||
}
|
||||
|
||||
unless ($self->{disabled}) {
|
||||
$self->{ua} = new LWP::UserAgent;
|
||||
$self->{ua}->{max_redirect} = 0;
|
||||
$self->{ua}->{timeout} = 5;
|
||||
$self->{ua}->env_proxy;
|
||||
$self->{logging} = 0;
|
||||
$self->{caching} = 0;
|
||||
$self->{syslog} = 0;
|
||||
}
|
||||
|
||||
$self->set_config($mailsaobject->{conf});
|
||||
$self->register_method_priority ('parsed_metadata', -1);
|
||||
$self->register_eval_rule('short_url_tests');
|
||||
|
||||
return $self;
|
||||
}
|
||||
|
||||
sub set_config {
|
||||
my($self, $conf) = @_;
|
||||
my @cmds = ();
|
||||
|
||||
push (@cmds, {
|
||||
setting => 'url_shortener',
|
||||
default => {},
|
||||
code => sub {
|
||||
my ($self, $key, $value, $line) = @_;
|
||||
if ($value =~ /^$/) {
|
||||
return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
|
||||
}
|
||||
foreach my $domain (split(/\s+/, $value)) {
|
||||
$self->{url_shorteners}->{lc $domain} = 1;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
=cut
|
||||
|
||||
=head1 PRIVILEGED SETTINGS
|
||||
|
||||
=over 4
|
||||
|
||||
=item url_shortener_log (default: none)
|
||||
|
||||
A path to a log file to be written to. The file will be created if it does
|
||||
not already exist and must be writable by the user running spamassassin.
|
||||
|
||||
For each short URL found the following will be written to the log file:
|
||||
[unix_epoch_time] <short url> => <decoded url>
|
||||
|
||||
=cut
|
||||
|
||||
push (@cmds, {
|
||||
setting => 'url_shortener_log',
|
||||
default => '',
|
||||
is_priv => 1,
|
||||
type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
|
||||
});
|
||||
|
||||
=item url_shortener_cache (default: none)
|
||||
|
||||
The full path to a database file to write cache entries to. The database will
|
||||
be created automatically if is does not already exist but the supplied path
|
||||
and file must be read/writable by the user running spamassassin or spamd.
|
||||
|
||||
|
||||
NOTE: you will need the DBD::SQLite module installed to use this feature.
|
||||
|
||||
Example:
|
||||
|
||||
url_shortener_cache /tmp/DecodeShortURLs.sq3
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
push (@cmds, {
|
||||
setting => 'url_shortener_cache',
|
||||
default => '',
|
||||
is_priv => 1,
|
||||
type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
|
||||
});
|
||||
|
||||
=item url_shortener_cache_ttl (default: 86400)
|
||||
|
||||
The length of time a cache entry will be valid for in seconds.
|
||||
Default is 86400 (1 day).
|
||||
|
||||
|
||||
NOTE: you will also need to run the following via cron to actually remove the
|
||||
records from the database:
|
||||
|
||||
echo "DELETE FROM short_url_cache WHERE modified < strftime('%s',now) - <ttl>; | sqlite3 /path/to/database"
|
||||
|
||||
|
||||
NOTE: replace <ttl> above with the same value you use for this option
|
||||
|
||||
=cut
|
||||
|
||||
push (@cmds, {
|
||||
setting => 'url_shortener_cache_ttl',
|
||||
is_admin => 1,
|
||||
default => 86400,
|
||||
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
|
||||
});
|
||||
|
||||
=item url_shortener_syslog (default: 0 (off))
|
||||
|
||||
If this option is enabled (set to 1), then short URLs and the decoded URLs will be logged to syslog (mail.info).
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
push (@cmds, {
|
||||
setting => 'url_shortener_syslog',
|
||||
is_admin => 1,
|
||||
default => 0,
|
||||
type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
|
||||
});
|
||||
|
||||
|
||||
$conf->{parser}->register_commands(\@cmds);
|
||||
}
|
||||
|
||||
sub parsed_metadata {
|
||||
my ($self, $opts) = @_;
|
||||
my $pms = $opts->{permsgstatus};
|
||||
my $msg = $opts->{msg};
|
||||
|
||||
return if $self->{disabled};
|
||||
|
||||
dbg ('warn: get_uri_detail_list() has been called already')
|
||||
if exists $pms->{uri_detail_list};
|
||||
|
||||
# don't keep dereferencing these
|
||||
$self->{url_shorteners} = $pms->{main}->{conf}->{url_shorteners};
|
||||
($self->{url_shortener_log}) = ($pms->{main}->{conf}->{url_shortener_log} =~ /^(.*)$/g);
|
||||
($self->{url_shortener_cache}) = ($pms->{main}->{conf}->{url_shortener_cache} =~ /^(.*)$/g);
|
||||
$self->{url_shortener_cache_ttl} = $pms->{main}->{conf}->{url_shortener_cache_ttl};
|
||||
$self->{url_shortener_syslog} = $pms->{main}->{conf}->{url_shortener_syslog};
|
||||
|
||||
# Sort short URLs into hash to de-dup them
|
||||
my %short_urls;
|
||||
my $uris = $pms->get_uri_detail_list();
|
||||
while (my($uri, $info) = each %{$uris}) {
|
||||
next unless ($info->{domains});
|
||||
foreach ( keys %{ $info->{domains} } ) {
|
||||
if (exists $self->{url_shorteners}->{lc $_}) {
|
||||
# NOTE: $info->{domains} appears to contain all the domains parsed
|
||||
# from the single input URI with no way to work out what the base
|
||||
# domain is. So to prevent someone from stuffing the URI with a
|
||||
# shortener to force this plug-in to follow a link that *isn't* on
|
||||
# the list of shorteners; we enforce that the shortener must be the
|
||||
# base URI and that a path must be present.
|
||||
if ($uri !~ /^http:\/\/(?:www\.)?$_\/.+$/) {
|
||||
dbg("Discarding URI: $uri");
|
||||
next;
|
||||
}
|
||||
$short_urls{$uri} = 1;
|
||||
next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Make sure we have some work to do
|
||||
# Before we open any log files etc.
|
||||
my $count = scalar keys %short_urls;
|
||||
return undef unless $count gt 0;
|
||||
|
||||
# Initialise logging if enabled
|
||||
if ($self->{url_shortener_log}) {
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
open($self->{logfh}, '>>'.$self->{url_shortener_log}) or die $!;
|
||||
};
|
||||
if ($@) {
|
||||
dbg("warn: $@");
|
||||
} else {
|
||||
$self->{logging} = 1;
|
||||
}
|
||||
}
|
||||
|
||||
# Initialise syslog if enabled
|
||||
if ($self->{url_shortener_syslog}) {
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
openlog('DecodeShortURLs','ndelay,pid','mail');
|
||||
};
|
||||
if ($@) {
|
||||
dbg("warn: $@");
|
||||
} else {
|
||||
$self->{syslog} = 1;
|
||||
}
|
||||
}
|
||||
|
||||
# Initialise cache if enabled
|
||||
if ($self->{url_shortener_cache} && HAS_SQLITE) {
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
$self->{dbh} = DBI->connect_cached("dbi:SQLite:dbname=".$self->{url_shortener_cache},"","", {RaiseError => 1, PrintError => 0, InactiveDestroy => 1}) or die $!;
|
||||
};
|
||||
if ($@) {
|
||||
dbg("warn: $@");
|
||||
} else {
|
||||
$self->{caching} = 1;
|
||||
|
||||
# Create database if needed
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
$self->{dbh}->do("
|
||||
CREATE TABLE IF NOT EXISTS short_url_cache (
|
||||
short_url TEXT PRIMARY KEY NOT NULL,
|
||||
decoded_url TEXT NOT NULL,
|
||||
hits INTEGER NOT NULL DEFAULT 1,
|
||||
created INTEGER NOT NULL DEFAULT (strftime('%s','now')),
|
||||
modified INTEGER NOT NULL DEFAULT (strftime('%s','now'))
|
||||
)
|
||||
");
|
||||
$self->{dbh}->do("
|
||||
CREATE INDEX IF NOT EXISTS short_url_by_modified
|
||||
ON short_url_cache(short_url, modified)
|
||||
");
|
||||
$self->{dbh}->do("
|
||||
CREATE INDEX IF NOT EXISTS short_url_modified
|
||||
ON short_url_cache(modified)
|
||||
");
|
||||
};
|
||||
if ($@) {
|
||||
dbg("warn: $@");
|
||||
$self->{caching} = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
my $max_short_urls = 10;
|
||||
foreach my $short_url (keys %short_urls) {
|
||||
next if ($max_short_urls le 0);
|
||||
my $location = $self->recursive_lookup($short_url, $pms);
|
||||
$max_short_urls--;
|
||||
}
|
||||
|
||||
# Close log
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
close($self->{logfh}) or die $!;
|
||||
} if $self->{logging};
|
||||
|
||||
# Close syslog
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
closelog() or die $!;
|
||||
} if $self->{syslog};
|
||||
|
||||
# Don't disconnect cached database handle
|
||||
# eval { $self->{dbh}->disconnect() or die $!; } if $self->{caching};
|
||||
}
|
||||
|
||||
sub recursive_lookup {
|
||||
my ($self, $short_url, $pms, %been_here) = @_;
|
||||
|
||||
my $count = scalar keys %been_here;
|
||||
dbg("Redirection count $count") if $count gt 0;
|
||||
if ($count ge 10) {
|
||||
dbg("Error: more than 10 shortener redirections");
|
||||
# Fire test
|
||||
$pms->got_hit('SHORT_URL_MAXCHAIN');
|
||||
return undef;
|
||||
}
|
||||
|
||||
my $location;
|
||||
|
||||
if ($self->{caching} && ($location = $self->cache_get($short_url))) {
|
||||
dbg("Found cached $short_url => $location");
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
$self->log_to_file("$short_url => $location")
|
||||
} if $self->{logging};
|
||||
syslog('info',"Found cached $short_url => $location") if $self->{syslog};
|
||||
} else {
|
||||
# Not cached; do lookup
|
||||
my $response = $self->{ua}->head($short_url);
|
||||
if (!$response->is_redirect) {
|
||||
dbg("Skipping URL as not redirect: $short_url = ".$response->status_line);
|
||||
$pms->got_hit('SHORT_URL_404') if($response->code == '404');
|
||||
return undef;
|
||||
}
|
||||
$location = $response->headers->{location};
|
||||
# Bail out if $short_url redirects to itself
|
||||
return undef if ($short_url eq $location);
|
||||
$self->cache_add($short_url, $location) if $self->{caching};
|
||||
dbg("Found $short_url => $location");
|
||||
eval {
|
||||
local $SIG{'__DIE__'};
|
||||
$self->log_to_file("$short_url => $location")
|
||||
} if $self->{logging};
|
||||
syslog('info',"Found $short_url => $location") if $self->{syslog};
|
||||
}
|
||||
|
||||
# At this point we have a new URL in $response
|
||||
$pms->got_hit('HAS_SHORT_URL');
|
||||
_add_uri_detail_list($pms, $location);
|
||||
|
||||
# Set chained here otherwise we might mark a disabled page or
|
||||
# redirect back to the same host as chaining incorrectly.
|
||||
$pms->got_hit('SHORT_URL_CHAINED') if ($count gt 0);
|
||||
|
||||
# Check if we are being redirected to a local page
|
||||
# Don't recurse in this case...
|
||||
if($location !~ /^https?:/) {
|
||||
my($host) = ($short_url =~ /^(https?:\/\/\S+)\//);
|
||||
$location = "$host/$location";
|
||||
dbg("Looks like a local redirection: $short_url => $location");
|
||||
_add_uri_detail_list($pms, $location);
|
||||
return $location;
|
||||
}
|
||||
|
||||
# Check for recursion
|
||||
if ((my ($domain) = ($location =~ /^https?:\/\/(\S+)\//))) {
|
||||
if (exists $been_here{$location}) {
|
||||
# Loop detected
|
||||
dbg("Error: loop detected");
|
||||
$pms->got_hit('SHORT_URL_LOOP');
|
||||
return $location;
|
||||
} else {
|
||||
if (exists $self->{url_shorteners}->{$domain}) {
|
||||
$been_here{$location} = 1;
|
||||
# Recurse...
|
||||
return $self->recursive_lookup($location, $pms, %been_here);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# No recursion; just return the final location...
|
||||
return $location;
|
||||
}
|
||||
|
||||
sub short_url_tests {
|
||||
# Set by parsed_metadata
|
||||
return 0;
|
||||
}
|
||||
|
||||
# Beware. Code copied from PerMsgStatus get_uri_detail_list().
|
||||
# Stolen from GUDO.pm
|
||||
sub _add_uri_detail_list {
|
||||
my ($pms, $uri) = @_;
|
||||
my $info;
|
||||
|
||||
# Cache of text parsed URIs, as previously used by get_uri_detail_list().
|
||||
push @{$pms->{parsed_uri_list}}, $uri;
|
||||
|
||||
$info->{types}->{parsed} = 1;
|
||||
|
||||
$info->{cleaned} =
|
||||
[Mail::SpamAssassin::Util::uri_list_canonify (undef, $uri)];
|
||||
|
||||
foreach (@{$info->{cleaned}}) {
|
||||
my $dom = Mail::SpamAssassin::Util::uri_to_domain($_);
|
||||
|
||||
if ($dom && !$info->{domains}->{$dom}) {
|
||||
$info->{domains}->{$dom} = 1;
|
||||
$pms->{uri_domain_count}++;
|
||||
}
|
||||
}
|
||||
|
||||
$pms->{uri_detail_list}->{$uri} = $info;
|
||||
|
||||
# And of course, copied code from PerMsgStatus get_uri_list(). *sigh*
|
||||
dbg ('warn: PMS::get_uri_list() appears to have been harvested'),
|
||||
push @{$pms->{uri_list}}, @{$info->{cleaned}}
|
||||
if exists $pms->{uri_list};
|
||||
}
|
||||
|
||||
sub log_to_file {
|
||||
my ($self, $msg) = @_;
|
||||
return undef if not $self->{logging};
|
||||
my $fh = $self->{logfh};
|
||||
eval {
|
||||
flock($fh, LOCK_EX) or die $!;
|
||||
seek($fh, 0, SEEK_END) or die $!;
|
||||
print $fh '['.time.'] '.$msg."\n";
|
||||
flock($fh, LOCK_UN) or die $!;
|
||||
};
|
||||
}
|
||||
|
||||
sub cache_add {
|
||||
my ($self, $short_url, $decoded_url) = @_;
|
||||
return undef if not $self->{caching};
|
||||
|
||||
eval {
|
||||
$self->{sth_insert} = $self->{dbh}->prepare_cached("
|
||||
INSERT INTO short_url_cache (short_url, decoded_url)
|
||||
VALUES (?,?)
|
||||
");
|
||||
};
|
||||
if ($@) {
|
||||
dbg("warn: $@");
|
||||
return undef;
|
||||
};
|
||||
|
||||
$self->{sth_insert}->execute($short_url, $decoded_url);
|
||||
return undef;
|
||||
}
|
||||
|
||||
sub cache_get {
|
||||
my ($self, $key) = @_;
|
||||
return undef if not $self->{caching};
|
||||
|
||||
eval {
|
||||
$self->{sth_select} = $self->{dbh}->prepare_cached("
|
||||
SELECT decoded_url FROM short_url_cache
|
||||
WHERE short_url = ? AND modified > (strftime('%s','now') - ?)
|
||||
");
|
||||
};
|
||||
if ($@) {
|
||||
dbg("warn: $@");
|
||||
return undef;
|
||||
}
|
||||
|
||||
eval {
|
||||
$self->{sth_update} = $self->{dbh}->prepare_cached("
|
||||
UPDATE short_url_cache
|
||||
SET modified=strftime('%s','now'), hits=hits+1
|
||||
WHERE short_url = ?
|
||||
");
|
||||
};
|
||||
if ($@) {
|
||||
dbg("warn: $@");
|
||||
return undef;
|
||||
}
|
||||
|
||||
$self->{sth_select}->execute($key, $self->{url_shortener_cache_ttl});
|
||||
my $row = $self->{sth_select}->fetchrow_array();
|
||||
if($row) {
|
||||
# Found cache entry; touch it to prevent expiry
|
||||
$self->{sth_update}->execute($key);
|
||||
$self->{sth_select}->finish();
|
||||
$self->{sth_update}->finish();
|
||||
return $row;
|
||||
}
|
||||
|
||||
$self->{sth_select}->finish();
|
||||
$self->{sth_update}->finish();
|
||||
return undef;
|
||||
}
|
||||
|
||||
1;
|
||||
Reference in New Issue
Block a user