Files
zira-etc/mail/spamassassin/sa-train.pl
2021-05-24 22:18:33 +03:00

238 lines
7.3 KiB
Perl

#!/usr/bin/perl
# /lib 20030227
# based on SpamAssassin's sa-learn
use strict;
use warnings;
my $PREFIX = '/usr/local/stow/perl-5.6.1'; # substituted at 'make' time
my $DEF_RULES_DIR = '/usr/local/stow/perl-5.6.1/share/spamassassin'; # substituted at 'make' time
my $LOCAL_RULES_DIR = '/etc/mail/spamassassin'; # substituted at 'make' time
use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
#use Mail::SpamAssassin::NoMailAudit;
use Mail::SpamAssassin::PerMsgLearner;
use Getopt::Long;
use Pod::Usage;
use MIME::Parser ();
Getopt::Long::Configure(qw(bundling no_getopt_compat
no_auto_abbrev no_ignore_case));
my ($isspam, $forget, %opt);
GetOptions(
'spam' => sub { $isspam = 1; },
'ham|nonspam' => sub { $isspam = 0; },
'forget' => \$forget,
'config-file|C=s' => \$opt{'config-file'},
'prefs-file|p=s' => \$opt{'prefs-file'},
'no-rebuild|norebuild' => \$opt{'norebuild'},
'force-expire' => \$opt{'force-expire'},
'randseed=i' => \$opt{'randseed'},
'auto-whitelist|a' => \$opt{'auto-whitelist'},
'bias-scores|b' => \$opt{'bias-scores'},
'debug-level|D' => \$opt{'debug-level'},
'version|V' => \$opt{'version'},
'help|h|?' => \$opt{'help'},
) or usage(0, "Unknown option!");
if (defined $opt{'help'}) { usage(0, "For more information read the manual page"); }
if (defined $opt{'version'}) {
print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
exit 0;
}
if ( !defined $isspam && !defined $forget ) {
usage(0, "Please select either --spam, --ham, or --forget");
}
# create the tester factory
my $spamtest = new Mail::SpamAssassin ({
rules_filename => $opt{'config-file'},
userprefs_filename => $opt{'prefs-file'},
debug => defined($opt{'debug-level'}),
local_tests_only => 1,
dont_copy_prefs => 1,
PREFIX => $PREFIX,
DEF_RULES_DIR => $DEF_RULES_DIR,
LOCAL_RULES_DIR => $LOCAL_RULES_DIR,
});
$spamtest->init (1);
$spamtest->init_learner({
use_whitelist => $opt{'auto-whitelist'},
bias_scores => $opt{'bias-scores'},
force_expire => $opt{'force-expire'},
caller_will_untie => 1,
});
if (defined $opt{'randseed'}) {
srand ($opt{'randseed'});
}
# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
$SIG{INT} = \&killed;
$SIG{TERM} = \&killed;
# new MIME Parser:
my $parser = new MIME::Parser;
# don't parse rfc/822 sub-messages:
$parser->extract_nested_messages(0);
# don't create files:
$parser->output_to_core(1);
# now parse the message: ($entity is a MIME::Entity)
my $entity = $parser->parse(\*STDIN) or die "parse failed\n";
# must be multipart message:
$entity->is_multipart() or die "is not multipart\n";
my $messagecount = 0;
# loop over the parts: ($part is a MIME::Entity)
foreach my $part ($entity->parts()) {
my $effective_type = $part->effective_type;
# skip if not a message sub-part:
next unless $effective_type =~ m{^message/};
my $body = $part->stringify_body();
my @body = split (/^/m, $body);
my $dataref = \@body;
# my $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
my $ma = $spamtest->parse($dataref);
if ($ma->get_pristine_header("X-Spam-Status")) {
my $newtext = $spamtest->remove_spamassassin_markup($ma);
my @newtext = split (/^/m, $newtext);
$dataref = \@newtext;
# $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
$ma = $spamtest->parse($dataref);
}
$ma->{noexit} = 1;
my $learner = $spamtest->learn ($ma, undef, $isspam, $forget);
$messagecount++ if ($learner->did_learn());
$learner->finish();
}
warn "Learned from $messagecount messages.\n";
if (!$opt{norebuild}) {
$spamtest->rebuild_learner_caches();
}
};
if ($@) {
my $failure = $@;
$spamtest->finish_learner();
die $failure;
}
$spamtest->finish_learner();
exit 0;
sub killed {
$spamtest->finish_learner();
die "interrupted";
}
sub usage {
my ($verbose, $message) = @_;
my $ver = Mail::SpamAssassin::Version();
print "SpamAssassin version $ver\n";
pod2usage(-verbose => $verbose, -message => $message, -exitval => 64);
}
# ---------------------------------------------------------------------------
=head1 NAME
sa-learn-attach - train SpamAssassin's Bayesian classifier via attachments
=head1 SYNOPSIS
B<sa-learn-attach> [options] < I<message>
Options:
--ham Learn messages as ham
--spam Learn messages as spam
--forget Forget a message
--no-rebuild Skip building databases after scan
-C file, --config-file=file Path to standard configuration dir
-p prefs, --prefs-file=file Set user preferences file
-a, --auto-whitelist Use auto-whitelists
-D, --debug-level Print debugging messages
-V, --version Print version
-h, --help Print usage message
=head1 DESCRIPTION
This behaves just like SpamAssassin's B<sa-learn>, except it takes just one
message, as standard input. It strips out message attachments to that
message, and learns from each of those attachments. Non-message
attachments are silently ignored.
This means you can forward misclassified messages from within your mailer
to special accounts that will tell SpamAssassin that a given set of
messages were misclassified. This avoids the additional "Received" headers
that would occur using a mailer's "re-mail" or "bounce" feature.
For example, one could set up the following procmail recipe, for a user
xyz@myhome.org:
:0
* ^TOxyz\+sa-learn-\/(ham|spam|forget)
| /usr/local/bin/sa-learn-attach --$MATCH
This relies on a slight non-standard email extension sendmail allows (and
most other MTAs) which recognises E<lt>xyz+ anything@myhome.orgE<gt> as
really going to E<lt> xyz@myhome.orgE<gt>, and requires procmail 3.10 or
later for MATCH. You may wish to add some more rules to make it more
stringent (i.e., only when you send it).
Like B<sa-learn>, B<sa-learn-attach> removes SpamAssassin markup, if any,
in each message before learning, so you can just forward misclassified ham
rather than the original message.
By default, B<sa-learn-attach> rebuilds the Bayesian database after
learning all the messages. This takes some time, so it is probably
sensible to combine all misclassified spam into one message before
forwarding it to E<lt>xyz+ sa-learn-spam@myhome.orgE<gt>.
B<sa-learn-attach> uses the B<MIME::Tools> package to parse attachments
whereas SpamAssassin does not depend on B<MIME::Tools>.
=head1 SEE ALSO
sa-learn(1)
Mail::SpamAssassin(3)
spamassassin(1)
=head1 AUTHOR
Bill Clarke (/lib) E<lt>llib /at/ computer.orgE<gt>
with huge swathes of code taken directly from B<sa-learn> by Justin Mason.
=cut