#!/usr/bin/perl
# /lib 20030227
# based on SpamAssassin's sa-learn
#
# sa-learn-attach: read ONE MIME message on standard input and feed every
# message/* attachment it contains to SpamAssassin's learner as spam or ham
# (or make the learner forget it).  See the POD at the end of the file for
# the user-facing documentation; pod2usage() prints parts of it at runtime.

use strict;
use warnings;

my $PREFIX          = '/usr/local/stow/perl-5.6.1';                     # substituted at 'make' time
my $DEF_RULES_DIR   = '/usr/local/stow/perl-5.6.1/share/spamassassin';  # substituted at 'make' time
my $LOCAL_RULES_DIR = '/etc/mail/spamassassin';                         # substituted at 'make' time

use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
#use Mail::SpamAssassin::NoMailAudit;
use Mail::SpamAssassin::PerMsgLearner;
use Getopt::Long;
use Pod::Usage;
use MIME::Parser ();

Getopt::Long::Configure(qw(bundling no_getopt_compat no_auto_abbrev no_ignore_case));

# $isspam: 1 for --spam, 0 for --ham/--nonspam, undef when neither was given.
# $forget: set by --forget.
# %opt:    every other command-line option, keyed by its long name.
my ($isspam, $forget, %opt);

GetOptions(
    'spam'                 => sub { $isspam = 1; },
    'ham|nonspam'          => sub { $isspam = 0; },
    'forget'               => \$forget,
    'config-file|C=s'      => \$opt{'config-file'},
    'prefs-file|p=s'       => \$opt{'prefs-file'},
    'no-rebuild|norebuild' => \$opt{'norebuild'},
    'force-expire'         => \$opt{'force-expire'},
    'randseed=i'           => \$opt{'randseed'},
    'auto-whitelist|a'     => \$opt{'auto-whitelist'},
    'bias-scores|b'        => \$opt{'bias-scores'},
    'debug-level|D'        => \$opt{'debug-level'},
    'version|V'            => \$opt{'version'},
    'help|h|?'             => \$opt{'help'},
) or usage(0, "Unknown option!");

if (defined $opt{'help'}) {
    usage(0, "For more information read the manual page");
}

if (defined $opt{'version'}) {
    print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
    exit 0;
}

# At least one learning mode is required; usage() does not return.
if ( !defined $isspam && !defined $forget ) {
    usage(0, "Please select either --spam, --ham, or --forget");
}

# create the tester factory
my $spamtest = Mail::SpamAssassin->new({
    rules_filename     => $opt{'config-file'},
    userprefs_filename => $opt{'prefs-file'},
    debug              => defined($opt{'debug-level'}),
    local_tests_only   => 1,
    dont_copy_prefs    => 1,
    PREFIX             => $PREFIX,
    DEF_RULES_DIR      => $DEF_RULES_DIR,
    LOCAL_RULES_DIR    => $LOCAL_RULES_DIR,
});

$spamtest->init (1);

$spamtest->init_learner({
    use_whitelist    => $opt{'auto-whitelist'},
    bias_scores      => $opt{'bias-scores'},
    force_expire     => $opt{'force-expire'},
    caller_will_untie => 1,
});

if (defined $opt{'randseed'}) {
    srand ($opt{'randseed'});
}

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.  The block ends in a true value so that failure is detected
# from eval's return value rather than from $@ alone, which can be reset
# while the stack unwinds on older perls.
my $ok = eval {
    $SIG{INT}  = \&killed;
    $SIG{TERM} = \&killed;

    # new MIME Parser:
    my $parser = MIME::Parser->new;

    # don't parse rfc/822 sub-messages:
    $parser->extract_nested_messages(0);

    # don't create files:
    $parser->output_to_core(1);

    # now parse the message: ($entity is a MIME::Entity)
    my $entity = $parser->parse(\*STDIN) or die "parse failed\n";

    # must be multipart message:
    $entity->is_multipart() or die "is not multipart\n";

    my $messagecount = 0;

    # loop over the parts: ($part is a MIME::Entity)
    foreach my $part ($entity->parts()) {
        my $effective_type = $part->effective_type;

        # skip if not a message sub-part:
        next unless $effective_type =~ m{^message/};

        # Hand the attached message to SpamAssassin as an array of lines.
        my $body    = $part->stringify_body();
        my @body    = split (/^/m, $body);
        my $dataref = \@body;

        # my $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
        my $ma = $spamtest->parse($dataref);

        # An X-Spam-Status header means the attachment was already marked up
        # by SpamAssassin: remove the markup and re-parse before learning.
        if ($ma->get_pristine_header("X-Spam-Status")) {
            my $newtext = $spamtest->remove_spamassassin_markup($ma);
            my @newtext = split (/^/m, $newtext);
            $dataref = \@newtext;

            # $ma = Mail::SpamAssassin::NoMailAudit->new ('data' => $dataref);
            $ma = $spamtest->parse($dataref);
        }

        $ma->{noexit} = 1;

        my $learner = $spamtest->learn ($ma, undef, $isspam, $forget);
        $messagecount++ if ($learner->did_learn());
        $learner->finish();
    }

    warn "Learned from $messagecount messages.\n";

    if (!$opt{norebuild}) {
        $spamtest->rebuild_learner_caches();
    }

    1;
};

if (!$ok) {
    # Save the error before calling anything else that might overwrite $@.
    my $failure = $@ || "unknown failure\n";
    $spamtest->finish_learner();
    die $failure;
}

$spamtest->finish_learner();
exit 0;

# Signal handler for INT and TERM: finish the learner, then die so that the
# enclosing eval (when still active) sees the interruption as a failure.
sub killed {
    $spamtest->finish_learner();
    die "interrupted";
}

# Print the SpamAssassin version followed by a usage message, then exit.
#   $verbose - passed to pod2usage() as -verbose
#   $message - passed to pod2usage() as -message
# Does not return: pod2usage() is called with -exitval => 64.
sub usage {
    my ($verbose, $message) = @_;
    my $ver = Mail::SpamAssassin::Version();
    print "SpamAssassin version $ver\n";
    pod2usage(-verbose => $verbose, -message => $message, -exitval => 64);
}

# ---------------------------------------------------------------------------

=head1 NAME

sa-learn-attach - train SpamAssassin's Bayesian classifier via attachments

=head1 SYNOPSIS

B<sa-learn-attach> [options] < I<message>

Options:

    --ham                             Learn messages as ham
    --spam                            Learn messages as spam
    --forget                          Forget a message
    --no-rebuild                      Skip building databases after scan
    -C file, --config-file=file       Path to standard configuration dir
    -p prefs, --prefs-file=file       Set user preferences file
    -a, --auto-whitelist              Use auto-whitelists
    -D, --debug-level                 Print debugging messages
    -V, --version                     Print version
    -h, --help                        Print usage message

=head1 DESCRIPTION

This behaves just like SpamAssassin's B<sa-learn>, except it takes just one
message, as standard input.  It strips out message attachments to that
message, and learns from each of those attachments.  Non-message attachments
are silently ignored.

This means you can forward misclassified messages from within your mailer to
special accounts that will tell SpamAssassin that a given set of messages
were misclassified.  This avoids the additional "Received" headers that would
occur using a mailer's "re-mail" or "bounce" feature.

For example, one could set up the following procmail recipe, for a user
xyz@myhome.org:

    :0
    * ^TOxyz\+sa-learn-\/(ham|spam|forget)
    | /usr/local/bin/sa-learn-attach --$MATCH

This relies on a slightly non-standard email extension that sendmail (and
most other MTAs) allows, which recognises
E<lt>xyz+I<anything>@myhome.orgE<gt> as really going to
E<lt>xyz@myhome.orgE<gt>, and requires procmail 3.10 or later for MATCH.
You may wish to add some more rules to make it more stringent (i.e., only
when you send it).

Like B<sa-learn>, B<sa-learn-attach> removes SpamAssassin markup, if any, in
each message before learning, so you can just forward misclassified ham
rather than the original message.

By default, B<sa-learn-attach> rebuilds the Bayesian database after learning
all the messages.  This takes some time, so it is probably sensible to
combine all misclassified spam into one message before forwarding it to
E<lt>xyz+sa-learn-spam@myhome.orgE<gt>.

B<sa-learn-attach> uses the B<MIME::Parser> package to parse attachments
whereas SpamAssassin does not depend on B<MIME::Parser>.

=head1 SEE ALSO

sa-learn(1)
Mail::SpamAssassin(3)
spamassassin(1)

=head1 AUTHOR

Bill Clarke (/lib) E<lt>llib /at/ computer.orgE<gt>
with huge swathes of code taken directly from B<sa-learn> by Justin Mason.

=cut