238 lines
7.3 KiB
Perl
238 lines
7.3 KiB
Perl
#!/usr/bin/perl
|
|
# /lib 20030227
|
|
# based on SpamAssassin's sa-learn
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
my $PREFIX = '/usr/local/stow/perl-5.6.1'; # substituted at 'make' time
|
|
my $DEF_RULES_DIR = '/usr/local/stow/perl-5.6.1/share/spamassassin'; # substituted at 'make' time
|
|
my $LOCAL_RULES_DIR = '/etc/mail/spamassassin'; # substituted at 'make' time
|
|
|
|
use Mail::SpamAssassin;
|
|
use Mail::SpamAssassin::ArchiveIterator;
|
|
#use Mail::SpamAssassin::NoMailAudit;
|
|
use Mail::SpamAssassin::PerMsgLearner;
|
|
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
|
|
use MIME::Parser ();
|
|
|
|
Getopt::Long::Configure(qw(bundling no_getopt_compat
|
|
no_auto_abbrev no_ignore_case));
|
|
|
|
my ($isspam, $forget, %opt);
|
|
|
|
GetOptions(
|
|
'spam' => sub { $isspam = 1; },
|
|
'ham|nonspam' => sub { $isspam = 0; },
|
|
'forget' => \$forget,
|
|
'config-file|C=s' => \$opt{'config-file'},
|
|
'prefs-file|p=s' => \$opt{'prefs-file'},
|
|
|
|
'no-rebuild|norebuild' => \$opt{'norebuild'},
|
|
'force-expire' => \$opt{'force-expire'},
|
|
|
|
'randseed=i' => \$opt{'randseed'},
|
|
|
|
'auto-whitelist|a' => \$opt{'auto-whitelist'},
|
|
'bias-scores|b' => \$opt{'bias-scores'},
|
|
|
|
'debug-level|D' => \$opt{'debug-level'},
|
|
'version|V' => \$opt{'version'},
|
|
'help|h|?' => \$opt{'help'},
|
|
) or usage(0, "Unknown option!");
|
|
|
|
|
|
if (defined $opt{'help'}) { usage(0, "For more information read the manual page"); }
|
|
if (defined $opt{'version'}) {
|
|
print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
|
|
exit 0;
|
|
}
|
|
if ( !defined $isspam && !defined $forget ) {
|
|
usage(0, "Please select either --spam, --ham, or --forget");
|
|
}
|
|
|
|
# create the tester factory
|
|
my $spamtest = new Mail::SpamAssassin ({
|
|
rules_filename => $opt{'config-file'},
|
|
userprefs_filename => $opt{'prefs-file'},
|
|
debug => defined($opt{'debug-level'}),
|
|
local_tests_only => 1,
|
|
dont_copy_prefs => 1,
|
|
PREFIX => $PREFIX,
|
|
DEF_RULES_DIR => $DEF_RULES_DIR,
|
|
LOCAL_RULES_DIR => $LOCAL_RULES_DIR,
|
|
});
|
|
|
|
$spamtest->init (1);
|
|
|
|
$spamtest->init_learner({
|
|
use_whitelist => $opt{'auto-whitelist'},
|
|
bias_scores => $opt{'bias-scores'},
|
|
force_expire => $opt{'force-expire'},
|
|
caller_will_untie => 1,
|
|
});
|
|
|
|
if (defined $opt{'randseed'}) {
|
|
srand ($opt{'randseed'});
|
|
}
|
|
|
|
# run this lot in an eval block, so we can catch die's and clear
|
|
# up the dbs.
|
|
eval {
|
|
$SIG{INT} = \&killed;
|
|
$SIG{TERM} = \&killed;
|
|
|
|
# new MIME Parser:
|
|
my $parser = new MIME::Parser;
|
|
|
|
# don't parse rfc/822 sub-messages:
|
|
$parser->extract_nested_messages(0);
|
|
|
|
# don't create files:
|
|
$parser->output_to_core(1);
|
|
|
|
# now parse the message: ($entity is a MIME::Entity)
|
|
my $entity = $parser->parse(\*STDIN) or die "parse failed\n";
|
|
|
|
# must be multipart message:
|
|
$entity->is_multipart() or die "is not multipart\n";
|
|
|
|
my $messagecount = 0;
|
|
|
|
# loop over the parts: ($part is a MIME::Entity)
|
|
foreach my $part ($entity->parts()) {
|
|
|
|
my $effective_type = $part->effective_type;
|
|
|
|
# skip if not a message sub-part:
|
|
next unless $effective_type =~ m{^message/};
|
|
|
|
my $body = $part->stringify_body();
|
|
my @body = split (/^/m, $body);
|
|
my $dataref = \@body;
|
|
|
|
# my $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
|
|
my $ma = $spamtest->parse($dataref);
|
|
if ($ma->get_pristine_header("X-Spam-Status")) {
|
|
my $newtext = $spamtest->remove_spamassassin_markup($ma);
|
|
my @newtext = split (/^/m, $newtext);
|
|
$dataref = \@newtext;
|
|
# $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
|
|
$ma = $spamtest->parse($dataref);
|
|
}
|
|
|
|
$ma->{noexit} = 1;
|
|
|
|
my $learner = $spamtest->learn ($ma, undef, $isspam, $forget);
|
|
$messagecount++ if ($learner->did_learn());
|
|
$learner->finish();
|
|
|
|
}
|
|
|
|
warn "Learned from $messagecount messages.\n";
|
|
|
|
if (!$opt{norebuild}) {
|
|
$spamtest->rebuild_learner_caches();
|
|
}
|
|
};
|
|
|
|
|
|
if ($@) {
|
|
my $failure = $@;
|
|
$spamtest->finish_learner();
|
|
die $failure;
|
|
}
|
|
|
|
$spamtest->finish_learner();
|
|
exit 0;
|
|
|
|
sub killed {
|
|
$spamtest->finish_learner();
|
|
die "interrupted";
|
|
}
|
|
|
|
|
|
sub usage {
|
|
my ($verbose, $message) = @_;
|
|
my $ver = Mail::SpamAssassin::Version();
|
|
print "SpamAssassin version $ver\n";
|
|
pod2usage(-verbose => $verbose, -message => $message, -exitval => 64);
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
=head1 NAME
|
|
|
|
sa-learn-attach - train SpamAssassin's Bayesian classifier via attachments
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
B<sa-learn-attach> [options] < I<message>
|
|
|
|
Options:
|
|
|
|
--ham Learn messages as ham
|
|
--spam Learn messages as spam
|
|
--forget Forget a message
|
|
--no-rebuild Skip building databases after scan
|
|
-C file, --config-file=file Path to standard configuration dir
|
|
-p prefs, --prefs-file=file Set user preferences file
|
|
-a, --auto-whitelist Use auto-whitelists
|
|
-D, --debug-level Print debugging messages
|
|
-V, --version Print version
|
|
-h, --help Print usage message
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This behaves just like SpamAssassin's B<sa-learn>, except it takes just one
|
|
message, as standard input. It strips out message attachments to that
|
|
message, and learns from each of those attachments. Non-message
|
|
attachments are silently ignored.
|
|
|
|
This means you can forward misclassified messages from within your mailer
|
|
to special accounts that will tell SpamAssassin that a given set of
|
|
messages were misclassified. This avoids the additional "Received" headers
|
|
that would occur using a mailer's "re-mail" or "bounce" feature.
|
|
|
|
For example, one could set up the following procmail recipe, for a user
|
|
xyz@myhome.org:
|
|
|
|
:0
|
|
* ^TOxyz\+sa-learn-\/(ham|spam|forget)
|
|
| /usr/local/bin/sa-learn-attach --$MATCH
|
|
|
|
This relies on a slight non-standard email extension sendmail allows (and
|
|
most other MTAs) which recognises E<lt>xyz+ anything@myhome.orgE<gt> as
|
|
really going to E<lt> xyz@myhome.orgE<gt>, and requires procmail 3.10 or
|
|
later for MATCH. You may wish to add some more rules to make it more
|
|
stringent (i.e., only when you send it).
|
|
|
|
Like B<sa-learn>, B<sa-learn-attach> removes SpamAssassin markup, if any,
|
|
in each message before learning, so you can just forward misclassified ham
|
|
rather than the original message.
|
|
|
|
By default, B<sa-learn-attach> rebuilds the Bayesian database after
|
|
learning all the messages. This takes some time, so it is probably
|
|
sensible to combine all misclassified spam into one message before
|
|
forwarding it to E<lt>xyz+ sa-learn-spam@myhome.orgE<gt>.
|
|
|
|
B<sa-learn-attach> uses the B<MIME::Tools> package to parse attachments
|
|
whereas SpamAssassin does not depend on B<MIME::Tools>.
|
|
|
|
=head1 SEE ALSO
|
|
|
|
sa-learn(1)
|
|
Mail::SpamAssassin(3)
|
|
spamassassin(1)
|
|
|
|
=head1 AUTHOR
|
|
|
|
Bill Clarke (/lib) E<lt>llib /at/ computer.orgE<gt>
|
|
with huge swathes of code taken directly from B<sa-learn> by Justin Mason.
|
|
|
|
=cut
|