zira-etc/mail/spamassassin/sa-train.pl

#!/usr/bin/perl
# /lib 20030227
# based on SpamAssassin's sa-learn

use strict;
use warnings;

my $PREFIX = '/usr/local/stow/perl-5.6.1';  # substituted at 'make' time
my $DEF_RULES_DIR = '/usr/local/stow/perl-5.6.1/share/spamassassin';  # substituted at 'make' time
my $LOCAL_RULES_DIR = '/etc/mail/spamassassin';  # substituted at 'make' time

use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
#use Mail::SpamAssassin::NoMailAudit;
use Mail::SpamAssassin::PerMsgLearner;

use Getopt::Long;
use Pod::Usage;

use MIME::Parser ();

Getopt::Long::Configure(qw(bundling no_getopt_compat
                           no_auto_abbrev no_ignore_case));

my ($isspam, $forget, %opt);

GetOptions(
           'spam'                               => sub { $isspam = 1; },
           'ham|nonspam'                        => sub { $isspam = 0; },
           'forget'                             => \$forget,
           'config-file|C=s'                    => \$opt{'config-file'},
           'prefs-file|p=s'                     => \$opt{'prefs-file'},

           'no-rebuild|norebuild'               => \$opt{'norebuild'},
           'force-expire'                       => \$opt{'force-expire'},

           'randseed=i'                         => \$opt{'randseed'},

           'auto-whitelist|a'                   => \$opt{'auto-whitelist'},
           'bias-scores|b'                      => \$opt{'bias-scores'},

           'debug-level|D'                      => \$opt{'debug-level'},
           'version|V'                          => \$opt{'version'},
           'help|h|?'                           => \$opt{'help'},
           ) or usage(0, "Unknown option!");


if (defined $opt{'help'}) { usage(0, "For more information read the manual page"); }
if (defined $opt{'version'}) {
    print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
    exit 0;
}
if ( !defined $isspam && !defined $forget ) {
    usage(0, "Please select either --spam, --ham, or --forget");
}

# create the tester factory
my $spamtest = new Mail::SpamAssassin ({
    rules_filename      => $opt{'config-file'},
    userprefs_filename  => $opt{'prefs-file'},
    debug               => defined($opt{'debug-level'}),
    local_tests_only    => 1,
    dont_copy_prefs     => 1,
    PREFIX              => $PREFIX,
    DEF_RULES_DIR       => $DEF_RULES_DIR,
    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
});

$spamtest->init (1);

$spamtest->init_learner({
    use_whitelist       => $opt{'auto-whitelist'},
    bias_scores         => $opt{'bias-scores'},
    force_expire        => $opt{'force-expire'},
    caller_will_untie   => 1,
});

if (defined $opt{'randseed'}) {
    srand ($opt{'randseed'});
}

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
    $SIG{INT} = \&killed;
    $SIG{TERM} = \&killed;

    # new MIME Parser:
    my $parser = new MIME::Parser;

    # don't parse rfc/822 sub-messages:
    $parser->extract_nested_messages(0);

    # don't create files:
    $parser->output_to_core(1);

    # now parse the message: ($entity is a MIME::Entity)
    my $entity = $parser->parse(\*STDIN) or die "parse failed\n";

    # must be multipart message:
    $entity->is_multipart() or die "is not multipart\n";

    my $messagecount = 0;

    # loop over the parts: ($part is a MIME::Entity)
    foreach my $part ($entity->parts()) {

        my $effective_type = $part->effective_type;

        # skip if not a message sub-part:
        next unless $effective_type =~ m{^message/};

        my $body = $part->stringify_body();
        my @body = split (/^/m, $body);
        my $dataref = \@body;

#       my $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
        my $ma = $spamtest->parse($dataref);
        if ($ma->get_pristine_header("X-Spam-Status")) {
            my $newtext = $spamtest->remove_spamassassin_markup($ma);
            my @newtext = split (/^/m, $newtext);
            $dataref = \@newtext;
           # $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
            $ma = $spamtest->parse($dataref);
        }

        $ma->{noexit} = 1;

        my $learner = $spamtest->learn ($ma, undef, $isspam, $forget);
        $messagecount++ if ($learner->did_learn());
        $learner->finish();

    }

    warn "Learned from $messagecount messages.\n";

    if (!$opt{norebuild}) {
        $spamtest->rebuild_learner_caches();
    }
};


if ($@) {
    my $failure = $@;
    $spamtest->finish_learner();
    die $failure;
}

$spamtest->finish_learner();
exit 0;

sub killed {
  $spamtest->finish_learner();
  die "interrupted";
}


sub usage {
    my ($verbose, $message) = @_;
    my $ver = Mail::SpamAssassin::Version();
    print "SpamAssassin version $ver\n";
    pod2usage(-verbose => $verbose, -message => $message, -exitval => 64);
}


# ---------------------------------------------------------------------------

=head1 NAME

sa-learn-attach - train SpamAssassin's Bayesian classifier via attachments

=head1 SYNOPSIS

B<sa-learn-attach> [options] < I<message>

Options:

 --ham                             Learn messages as ham
 --spam                            Learn messages as spam
 --forget                          Forget a message
 --no-rebuild                      Skip building databases after scan
 -C file, --config-file=file       Path to standard configuration dir
 -p prefs, --prefs-file=file       Set user preferences file
 -a, --auto-whitelist              Use auto-whitelists
 -D, --debug-level                 Print debugging messages
 -V, --version                     Print version
 -h, --help                        Print usage message

=head1 DESCRIPTION

This behaves just like SpamAssassin's B<sa-learn>, except it takes just one
message, as standard input.  It strips out message attachments to that
message, and learns from each of those attachments.  Non-message
attachments are silently ignored.

This means you can forward misclassified messages from within your mailer
to special accounts that will tell SpamAssassin that a given set of
messages were misclassified.  This avoids the additional "Received" headers
that would occur using a mailer's "re-mail" or "bounce" feature.

For example, one could set up the following procmail recipe, for a user
xyz@myhome.org:

 :0
 * ^TOxyz\+sa-learn-\/(ham|spam|forget)
 | /usr/local/bin/sa-learn-attach --$MATCH

This relies on a slight non-standard email extension sendmail allows (and
most other MTAs) which recognises E<lt>xyz+ anything@myhome.orgE<gt> as
really going to E<lt> xyz@myhome.orgE<gt>, and requires procmail 3.10 or
later for MATCH.  You may wish to add some more rules to make it more
stringent (i.e., only when you send it).

Like B<sa-learn>, B<sa-learn-attach> removes SpamAssassin markup, if any,
in each message before learning, so you can just forward misclassified ham
rather than the original message.

By default, B<sa-learn-attach> rebuilds the Bayesian database after
learning all the messages.  This takes some time, so it is probably
sensible to combine all misclassified spam into one message before
forwarding it to E<lt>xyz+ sa-learn-spam@myhome.orgE<gt>.

B<sa-learn-attach> uses the B<MIME::Tools> package to parse attachments
whereas SpamAssassin does not depend on B<MIME::Tools>.

=head1 SEE ALSO

sa-learn(1)
Mail::SpamAssassin(3)
spamassassin(1)

=head1 AUTHOR

Bill Clarke (/lib) E<lt>llib /at/ computer.orgE<gt>
with huge swathes of code taken directly from B<sa-learn> by Justin Mason.

=cut