Files
zira-etc/mail/spamassassin/DecodeShortURLs.pm
2021-05-24 22:18:33 +03:00

565 lines
16 KiB
Perl

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# Author: Steve Freegard <steve.freegard@fsl.com>
=head1 NAME
DecodeShortURLs - Expand shortened URLs
=head1 SYNOPSIS
loadplugin Mail::SpamAssassin::Plugin::DecodeShortURLs
url_shortener bit.ly
url_shortener go.to
...
=head1 DESCRIPTION
This plugin looks for URLs shortened by a list of URL shortening services and
upon finding a matching URL will connect using to the shortening service and
do an HTTP HEAD lookup and retrieve the location header which points to the
actual shortened URL, it then adds this URL to the list of URIs extracted by
SpamAssassin which can then be accessed by other plug-ins, such as URIDNSBL.
This plugin also sets the rule HAS_SHORT_URL if any matching short URLs are
found.
Regular 'uri' rules can be used to detect and score links disabled by the
shortening service for abuse and URL_BITLY_BLOCKED is supplied as an example.
It should be safe to score this rule highly on a match as experience shows
that bit.ly only blocks access to a URL if it has seen consistent abuse and
problem reports.
As of version 0.3 this plug-in will follow 'chained' shorteners e.g.
short URL -> short URL -> short URL -> real URL
If this form of chaining is found, then the rule 'SHORT_URL_CHAINED' will be
fired. If a loop is detected then 'SHORT_URL_LOOP' will be fired.
This plug-in limits the number of chained shorteners to a maximim of 10 at
which point it will fire the rule 'SHORT_URL_MAXCHAIN' and go no further.
If a shortener returns a '404 Not Found' result for the short URL then the
rule 'SHORT_URL_404' will be fired.
=head1 NOTES
This plugin runs the parsed_metadata hook with a priority of -1 so that
it may modify the parsed URI list prior to the URIDNSBL plugin which
runs as priority 0.
Currently the plugin queries a maximum of 10 distinct shortened URLs with
a maximum timeout of 5 seconds per lookup. It does not recurse and follow
'chained' shortening as the author has no examples of this happening.
=head1 ACKNOWLEDGEMENTS
A lot of this plugin has been hacked together by using other plugins as
examples. The author would particularly like to tip his hat to Karsten
Bräckelmann for the _add_uri_detail_list() function that he stole from
GUDO.pm for which this plugin would not be possible due to the SpamAssassin
API making no provision for adding to the base list of extracted URIs and
the author not knowing enough about Perl to be able to achieve this without
a good example from someone that does ;-)
=cut
package Mail::SpamAssassin::Plugin::DecodeShortURLs;
my $VERSION = 0.6;
use Mail::SpamAssassin::Plugin;
use strict;
use warnings;
use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin);
use constant HAS_LWP_USERAGENT => eval { local $SIG{'__DIE__'}; require LWP::UserAgent; };
use constant HAS_SQLITE => eval { local $SIG{'__DIE__'}; require DBD::SQLite; };
use Fcntl qw(:flock SEEK_END);
use Sys::Syslog qw(:DEFAULT setlogsock);
sub dbg {
my $msg = shift;
return Mail::SpamAssassin::Logger::dbg("DecodeShortURLs: $msg");
}
sub new {
my $class = shift;
my $mailsaobject = shift;
$class = ref($class) || $class;
my $self = $class->SUPER::new($mailsaobject);
bless ($self, $class);
if ($mailsaobject->{local_tests_only} || !HAS_LWP_USERAGENT) {
$self->{disabled} = 1;
} else {
$self->{disabled} = 0;
}
unless ($self->{disabled}) {
$self->{ua} = new LWP::UserAgent;
$self->{ua}->{max_redirect} = 0;
$self->{ua}->{timeout} = 5;
$self->{ua}->env_proxy;
$self->{logging} = 0;
$self->{caching} = 0;
$self->{syslog} = 0;
}
$self->set_config($mailsaobject->{conf});
$self->register_method_priority ('parsed_metadata', -1);
$self->register_eval_rule('short_url_tests');
return $self;
}
sub set_config {
my($self, $conf) = @_;
my @cmds = ();
push (@cmds, {
setting => 'url_shortener',
default => {},
code => sub {
my ($self, $key, $value, $line) = @_;
if ($value =~ /^$/) {
return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
}
foreach my $domain (split(/\s+/, $value)) {
$self->{url_shorteners}->{lc $domain} = 1;
}
}
});
=cut
=head1 PRIVILEGED SETTINGS
=over 4
=item url_shortener_log (default: none)
A path to a log file to be written to. The file will be created if it does
not already exist and must be writable by the user running spamassassin.
For each short URL found the following will be written to the log file:
[unix_epoch_time] <short url> => <decoded url>
=cut
push (@cmds, {
setting => 'url_shortener_log',
default => '',
is_priv => 1,
type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
});
=item url_shortener_cache (default: none)
The full path to a database file to write cache entries to. The database will
be created automatically if is does not already exist but the supplied path
and file must be read/writable by the user running spamassassin or spamd.
NOTE: you will need the DBD::SQLite module installed to use this feature.
Example:
url_shortener_cache /tmp/DecodeShortURLs.sq3
=cut
push (@cmds, {
setting => 'url_shortener_cache',
default => '',
is_priv => 1,
type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
});
=item url_shortener_cache_ttl (default: 86400)
The length of time a cache entry will be valid for in seconds.
Default is 86400 (1 day).
NOTE: you will also need to run the following via cron to actually remove the
records from the database:
echo "DELETE FROM short_url_cache WHERE modified < strftime('%s',now) - <ttl>; | sqlite3 /path/to/database"
NOTE: replace <ttl> above with the same value you use for this option
=cut
push (@cmds, {
setting => 'url_shortener_cache_ttl',
is_admin => 1,
default => 86400,
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
});
=item url_shortener_syslog (default: 0 (off))
If this option is enabled (set to 1), then short URLs and the decoded URLs will be logged to syslog (mail.info).
=cut
push (@cmds, {
setting => 'url_shortener_syslog',
is_admin => 1,
default => 0,
type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
});
$conf->{parser}->register_commands(\@cmds);
}
sub parsed_metadata {
my ($self, $opts) = @_;
my $pms = $opts->{permsgstatus};
my $msg = $opts->{msg};
return if $self->{disabled};
dbg ('warn: get_uri_detail_list() has been called already')
if exists $pms->{uri_detail_list};
# don't keep dereferencing these
$self->{url_shorteners} = $pms->{main}->{conf}->{url_shorteners};
($self->{url_shortener_log}) = ($pms->{main}->{conf}->{url_shortener_log} =~ /^(.*)$/g);
($self->{url_shortener_cache}) = ($pms->{main}->{conf}->{url_shortener_cache} =~ /^(.*)$/g);
$self->{url_shortener_cache_ttl} = $pms->{main}->{conf}->{url_shortener_cache_ttl};
$self->{url_shortener_syslog} = $pms->{main}->{conf}->{url_shortener_syslog};
# Sort short URLs into hash to de-dup them
my %short_urls;
my $uris = $pms->get_uri_detail_list();
while (my($uri, $info) = each %{$uris}) {
next unless ($info->{domains});
foreach ( keys %{ $info->{domains} } ) {
if (exists $self->{url_shorteners}->{lc $_}) {
# NOTE: $info->{domains} appears to contain all the domains parsed
# from the single input URI with no way to work out what the base
# domain is. So to prevent someone from stuffing the URI with a
# shortener to force this plug-in to follow a link that *isn't* on
# the list of shorteners; we enforce that the shortener must be the
# base URI and that a path must be present.
if ($uri !~ /^http:\/\/(?:www\.)?$_\/.+$/) {
dbg("Discarding URI: $uri");
next;
}
$short_urls{$uri} = 1;
next;
}
}
}
# Make sure we have some work to do
# Before we open any log files etc.
my $count = scalar keys %short_urls;
return undef unless $count gt 0;
# Initialise logging if enabled
if ($self->{url_shortener_log}) {
eval {
local $SIG{'__DIE__'};
open($self->{logfh}, '>>'.$self->{url_shortener_log}) or die $!;
};
if ($@) {
dbg("warn: $@");
} else {
$self->{logging} = 1;
}
}
# Initialise syslog if enabled
if ($self->{url_shortener_syslog}) {
eval {
local $SIG{'__DIE__'};
openlog('DecodeShortURLs','ndelay,pid','mail');
};
if ($@) {
dbg("warn: $@");
} else {
$self->{syslog} = 1;
}
}
# Initialise cache if enabled
if ($self->{url_shortener_cache} && HAS_SQLITE) {
eval {
local $SIG{'__DIE__'};
$self->{dbh} = DBI->connect_cached("dbi:SQLite:dbname=".$self->{url_shortener_cache},"","", {RaiseError => 1, PrintError => 0, InactiveDestroy => 1}) or die $!;
};
if ($@) {
dbg("warn: $@");
} else {
$self->{caching} = 1;
# Create database if needed
eval {
local $SIG{'__DIE__'};
$self->{dbh}->do("
CREATE TABLE IF NOT EXISTS short_url_cache (
short_url TEXT PRIMARY KEY NOT NULL,
decoded_url TEXT NOT NULL,
hits INTEGER NOT NULL DEFAULT 1,
created INTEGER NOT NULL DEFAULT (strftime('%s','now')),
modified INTEGER NOT NULL DEFAULT (strftime('%s','now'))
)
");
$self->{dbh}->do("
CREATE INDEX IF NOT EXISTS short_url_by_modified
ON short_url_cache(short_url, modified)
");
$self->{dbh}->do("
CREATE INDEX IF NOT EXISTS short_url_modified
ON short_url_cache(modified)
");
};
if ($@) {
dbg("warn: $@");
$self->{caching} = 0;
}
}
}
my $max_short_urls = 10;
foreach my $short_url (keys %short_urls) {
next if ($max_short_urls le 0);
my $location = $self->recursive_lookup($short_url, $pms);
$max_short_urls--;
}
# Close log
eval {
local $SIG{'__DIE__'};
close($self->{logfh}) or die $!;
} if $self->{logging};
# Close syslog
eval {
local $SIG{'__DIE__'};
closelog() or die $!;
} if $self->{syslog};
# Don't disconnect cached database handle
# eval { $self->{dbh}->disconnect() or die $!; } if $self->{caching};
}
sub recursive_lookup {
my ($self, $short_url, $pms, %been_here) = @_;
my $count = scalar keys %been_here;
dbg("Redirection count $count") if $count gt 0;
if ($count ge 10) {
dbg("Error: more than 10 shortener redirections");
# Fire test
$pms->got_hit('SHORT_URL_MAXCHAIN');
return undef;
}
my $location;
if ($self->{caching} && ($location = $self->cache_get($short_url))) {
dbg("Found cached $short_url => $location");
eval {
local $SIG{'__DIE__'};
$self->log_to_file("$short_url => $location")
} if $self->{logging};
syslog('info',"Found cached $short_url => $location") if $self->{syslog};
} else {
# Not cached; do lookup
my $response = $self->{ua}->head($short_url);
if (!$response->is_redirect) {
dbg("Skipping URL as not redirect: $short_url = ".$response->status_line);
$pms->got_hit('SHORT_URL_404') if($response->code == '404');
return undef;
}
$location = $response->headers->{location};
# Bail out if $short_url redirects to itself
return undef if ($short_url eq $location);
$self->cache_add($short_url, $location) if $self->{caching};
dbg("Found $short_url => $location");
eval {
local $SIG{'__DIE__'};
$self->log_to_file("$short_url => $location")
} if $self->{logging};
syslog('info',"Found $short_url => $location") if $self->{syslog};
}
# At this point we have a new URL in $response
$pms->got_hit('HAS_SHORT_URL');
_add_uri_detail_list($pms, $location);
# Set chained here otherwise we might mark a disabled page or
# redirect back to the same host as chaining incorrectly.
$pms->got_hit('SHORT_URL_CHAINED') if ($count gt 0);
# Check if we are being redirected to a local page
# Don't recurse in this case...
if($location !~ /^https?:/) {
my($host) = ($short_url =~ /^(https?:\/\/\S+)\//);
$location = "$host/$location";
dbg("Looks like a local redirection: $short_url => $location");
_add_uri_detail_list($pms, $location);
return $location;
}
# Check for recursion
if ((my ($domain) = ($location =~ /^https?:\/\/(\S+)\//))) {
if (exists $been_here{$location}) {
# Loop detected
dbg("Error: loop detected");
$pms->got_hit('SHORT_URL_LOOP');
return $location;
} else {
if (exists $self->{url_shorteners}->{$domain}) {
$been_here{$location} = 1;
# Recurse...
return $self->recursive_lookup($location, $pms, %been_here);
}
}
}
# No recursion; just return the final location...
return $location;
}
sub short_url_tests {
# Set by parsed_metadata
return 0;
}
# Beware. Code copied from PerMsgStatus get_uri_detail_list().
# Stolen from GUDO.pm
sub _add_uri_detail_list {
my ($pms, $uri) = @_;
my $info;
# Cache of text parsed URIs, as previously used by get_uri_detail_list().
push @{$pms->{parsed_uri_list}}, $uri;
$info->{types}->{parsed} = 1;
$info->{cleaned} =
[Mail::SpamAssassin::Util::uri_list_canonify (undef, $uri)];
foreach (@{$info->{cleaned}}) {
my $dom = Mail::SpamAssassin::Util::uri_to_domain($_);
if ($dom && !$info->{domains}->{$dom}) {
$info->{domains}->{$dom} = 1;
$pms->{uri_domain_count}++;
}
}
$pms->{uri_detail_list}->{$uri} = $info;
# And of course, copied code from PerMsgStatus get_uri_list(). *sigh*
dbg ('warn: PMS::get_uri_list() appears to have been harvested'),
push @{$pms->{uri_list}}, @{$info->{cleaned}}
if exists $pms->{uri_list};
}
sub log_to_file {
my ($self, $msg) = @_;
return undef if not $self->{logging};
my $fh = $self->{logfh};
eval {
flock($fh, LOCK_EX) or die $!;
seek($fh, 0, SEEK_END) or die $!;
print $fh '['.time.'] '.$msg."\n";
flock($fh, LOCK_UN) or die $!;
};
}
sub cache_add {
my ($self, $short_url, $decoded_url) = @_;
return undef if not $self->{caching};
eval {
$self->{sth_insert} = $self->{dbh}->prepare_cached("
INSERT INTO short_url_cache (short_url, decoded_url)
VALUES (?,?)
");
};
if ($@) {
dbg("warn: $@");
return undef;
};
$self->{sth_insert}->execute($short_url, $decoded_url);
return undef;
}
sub cache_get {
my ($self, $key) = @_;
return undef if not $self->{caching};
eval {
$self->{sth_select} = $self->{dbh}->prepare_cached("
SELECT decoded_url FROM short_url_cache
WHERE short_url = ? AND modified > (strftime('%s','now') - ?)
");
};
if ($@) {
dbg("warn: $@");
return undef;
}
eval {
$self->{sth_update} = $self->{dbh}->prepare_cached("
UPDATE short_url_cache
SET modified=strftime('%s','now'), hits=hits+1
WHERE short_url = ?
");
};
if ($@) {
dbg("warn: $@");
return undef;
}
$self->{sth_select}->execute($key, $self->{url_shortener_cache_ttl});
my $row = $self->{sth_select}->fetchrow_array();
if($row) {
# Found cache entry; touch it to prevent expiry
$self->{sth_update}->execute($key);
$self->{sth_select}->finish();
$self->{sth_update}->finish();
return $row;
}
$self->{sth_select}->finish();
$self->{sth_update}->finish();
return undef;
}
1;