Files
zira-etc/mail/spamassassin/abc/BayesOCR_PLG.pm
2021-05-24 22:18:33 +03:00

401 lines
12 KiB
Perl

#*************************************************************************
# Bayes OCR Plugin, version 0.1
#*************************************************************************
# Copyright 2007 P.R.A. Group - D.I.E.E. - University of Cagliari (ITA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#*************************************************************************
package BayesOCR_PLG;
use strict;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Logger;
our @ISA = qw (Mail::SpamAssassin::Plugin);
# constructor: register the eval rule
sub new {
my ( $class, $mailsa ) = @_;
$class = ref($class) || $class;
my $self = $class->SUPER::new($mailsa);
bless( $self, $class );
dbg("PLG-BayesOCR:: new:: register_eval_rule");
$self->register_eval_rule("BayesOCR_check");
$self->{'imgTxt_classifierOut'} = -1;
$self->{'imgTxt_tagmsg'} = ""; #msg to be saved in e-mail tag when $self->{'imgTxt_classifierOut'} <= 0
return $self;
}
#===========================================================================
#===========================================================================
sub check_start{
# Called before eval rule
my ( $self, $pms ) = @_;
dbg("PLG-BayesOCR:: check_start:: init score");
#Init outNB_imgTxt
$self->{'imgTxt_classifierOut'} = -1;
$self->{'imgTxt_tagmsg'} = "";
}
sub isValidUser{
my ($pms) = @_;
my $username = $pms->{main}->{username};
dbg("PLG-BayesOCR:: isValidUser:: Username: $username");
return 1;
}
sub BayesOCR_check {
# BayesOCR_check(thr)
# Return an hit when (outNB > thr)
# The score is computed as (weigth * outNB)
#
my ($self, $pms, $unused, $thrL, $thrH) = @_;
my $plgRuleName = $pms->get_current_eval_rule_name();
#if( isValidUser($pms) == 0) { return 0; }
dbg("PLG-BayesOCR:: BayesOCR_check :: Rule: $plgRuleName");
dbg("PLG-BayesOCR:: BayesOCR_check :: thr: ($thrH, $thrL)");
if($self->{'imgTxt_classifierOut'} < 0)
{
#Output
if( $self->imageSpam_OCRTextProcessing($pms ) )
{
$self->{'imgTxt_tagmsg'} = $self->{'imgTxt_classifierOut'};
}
dbg("PLG-BayesOCR:: BayesOCR_check:: Write Mail Header\n\n");
$pms->set_tag ("PLGBAYESOCROUT", $self->{'imgTxt_tagmsg'} );
}
my $resHit = ($self->{'imgTxt_classifierOut'} > $thrL) && ($self->{'imgTxt_classifierOut'} <= $thrH );
return $resHit;
}
1;
#===========================================================================
sub imageSpam_OCRTextProcessing
# boolen $self->imageSpam_OCRTextProcessing($pms)
#
# imageSpam processing by image's text analisys with SA's NaiveBayes
# return 1 : (sucess) image's text has beeen extract and processed by NB
# return 0 : (failed) no images, no text, no NB.
{
my ( $self, $pms ) = @_;
# $self :: Obj Plugin
# $pms :: Obj Mail::SpamAssassin::PerMsgStatus
# $pms->{msg} :: message of class Mail::SpamAssassin::Message
#================================
# Init result
#================================
$self->{'imgTxt_classifierOut'} = 0;
#================================
# Check & Create Classifier
#================================
my $nbSA = $pms->{main}->{bayes_scanner};
#my $nbSA = new Mail::SpamAssassin::Bayes ($pms->{main});
if( $nbSA->is_scan_available() == 0)
{
dbg("PLG-BayesOCR:: imageTextClassifierOutEstimation: NB scan not available");
$self->{'imgTxt_tagmsg'} = "0.0 (NaiveBayes not available)";
return 0;
}
#================================
# Image extraction
#================================
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: Check for Attached Images");
my ($imgTextOcr, $numImages) = imageTextExtractionFromMSG($pms->{msg});
if($numImages == 0)
{
$self->{'imgTxt_tagmsg'} = "0.0 (No images found)";
return 0;
}
# Check extracted text
my $numWord = 0;
while($imgTextOcr =~ /[a-z]{3,}/gi)
{
$numWord++;
}
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: $numWord words (3+ chars) recognised");
if($numWord <= 3)
{
$self->{'imgTxt_tagmsg'} = "0.0 (No usefull text found)";
return 0;
}
#================================
# Classifier's output estimation
#================================
# creation of msg with image's text
my $mailraw = createMSGFromText($pms, $imgTextOcr);
my $msgTmp = $pms->{main}->parse($mailraw,1);
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: Compute score with trained NaiveBayes");
my $pmsTMP = new Mail::SpamAssassin::PerMsgStatus($pms->{main}, $msgTmp);
# Classification
my $outNB = $nbSA->scan($pmsTMP, $msgTmp);
$self->{'imgTxt_classifierOut'} = sprintf("%0.3f", $outNB);
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: classifier's out = $self->{'imgTxt_classifierOut'}" );
return 1; # All OK
}
#===========================================================================
sub imageTextExtractionFromMSG
# ($imgTextOcr, $numImages) = imageTextExtractionFromMSG($msg)
# Extract the text from all attached images
# Return all text anche the number of attached images
{
my $msg = $_[0];
dbg("PLG-BayesOCR:: imageTextExtractionFromMSG:: Extract & Convert Images");
my @mimeStr = ("image/*", "img/*");
my @tmpImgFile;
my $num=0;
my $imgTextOcr = "";
foreach (@mimeStr)
{
# Search all attach with current MIME
my @img_parts = $msg->find_parts($_);
for (my $i=0; $i <= $#img_parts; $i++)
{
my $imagestream = $img_parts[$i]->decode(1048000); # ~ 1 MB
$imgTextOcr = join $imgTextOcr, imageTextExtractionByOCR($imagestream), "\n";
$num++;
}
}
dbg("PLG-BayesOCR:: imageTextExtractionFromMSG:: $num images extracted");
return ($imgTextOcr, $num);
}
#===========================================================================
sub imageTextExtractionByOCR
# $textOut = imageTextExtractionByOCR( $imagestream )
# Text extraction from imge file "" by OCR engine
{
my $imagestream = $_[0];
my $imagelen = length($imagestream) / 1024;
my $tmpDir = "/tmp"; #Get tmp dir
my $tmpFile = "$tmpDir/sa_bayesOCR_tmpImg.$$";
# Zooming small images could improve OCR accuracy
# Byte Check
# > 1000K => no OCR
# < 15K => OCR + zoom 4X
# else => Check resolution
# Check resolution
# res > 1400x1050 => no OCR
# 1024x768 <= res < 1400x1050 => OCR (no zoom)
# 800x600 <= res < 1024x768 => OCR + zoom 2X
# res < 800x600 => OCR + zoom 4X
if ($imagelen > 1000)
{
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Skip, image size = $imagelen");
return "";
}
open (FILE, ">$tmpFile.tmp") or return "";
print FILE "$imagestream \n";
close FILE;
my $convertOPT = "";
my $imageIdentifyTxt = "";
if($imagelen < 20 )
{
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 4X");
$convertOPT = "-sample 400% -density 280";
}
else
{
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Check image dim");
# check WxH
open EXEFH, "identify -quiet -ping $tmpFile.tmp |";
$imageIdentifyTxt = join "", <EXEFH>;
close EXEFH;
if( $imageIdentifyTxt =~ s/\s(\d*)x(\d*)\s//i )
{
my $size1 = $1;
my $size2 = $2;
if($size1 * $size2 > 1400*1050 && $size1 > 1280 && $size2 > 1024)
{
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Skip, image dim = $size1 x $size2");
unlink "$tmpFile.tmp";
return "";
}
if( $size1 * $size2 < 800*600)
{
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 4X");
$convertOPT = "-sample 400% -density 280";
}
elsif( $size1 * $size2 < 1024*768)
{
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 2X");
$convertOPT = "-sample 200% -density 280";
}
}
}
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Convert & OCR");
# -append :: concatenate image i layers
# -flatten :: fuse layers
# -density :: set dpi
my $exstatus = system("convert $tmpFile.tmp -append -flatten $convertOPT $tmpFile.pnm");
if($exstatus != 0)
{
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Convert ERROR!!");
#Catturo SDOUT e STERR
open EXEFH, "identify -verbose -strip $tmpFile.tmp 2>&1 |";
$imageIdentifyTxt = join "", <EXEFH>;
close EXEFH;
my $msg = "Stream size (kb): $imagelen\nIdentify output: \n$imageIdentifyTxt\n";
saveLogMsg($tmpDir, "Convert Error", $msg);
unlink "$tmpFile.tmp";
return "";
}
# GOCR call with timeout (thanks to B. Austin for the usefull suggestions)
my $textOut = "";
eval {
local $SIG{ALRM} = sub { die "GOCR_TIMEOUT\n" };
alarm 10;
# Retrieve gocr output
open EXEFH, "gocr $tmpFile.pnm |";
$textOut = join "", <EXEFH>;
close EXEFH;
alarm 0;
};
if ($@) {
die unless $@ eq "GOCR_TIMEOUT\n"; # propagate unexpected errors
# timed out
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: OCR timeout!!");
# Extract the list of all child of this process
open PSFH, "ps -o pid,cmd --ppid $$ |";
my $psOut = join "", <PSFH>;
close PSFH;
#Get the PID of gocr child
if( $psOut =~ s/(\d*) gocr//i)
{
kill 9, $1;
}
my $msg = "Stream size (kb): $imagelen\nPS out:\n $psOut\n";
saveLogMsg($tmpDir, "OCR timeout", $msg);
$textOut = "";
}
unlink "$tmpFile.tmp";
unlink "$tmpFile.pnm";
return $textOut;
}
#===========================================================================
sub createMSGFromText
# msg = createMSGFromText(@img_ocrText)
{
my ($pms, $ocrText) = @_;
dbg("PLG-BayesOCR: createMSGFromText:: Make temp email with OCR's text");
my $subject = "";
my $date = $pms->{msg}->get_pristine_header("Date");
my $from = ""; #$pms->{msg}->get_pristine_header("From");
my $to = ""; #$pms->{msg}->get_pristine_header("To");
my $mailraw = "From: $from\nTo: $to\nSubject: $subject\nDate: $date\nContent-Type: text/plain;\n charset=\"us-ascii\"\nContent-Disposition: inline\n\n$ocrText\n";
return $mailraw
}
#===========================================================================
#===========================================================================
sub saveLogMsg()
{
my ($tmpDir, $title, $msg) = @_;
my $timenow = localtime time;
open (FILE, ">>$tmpDir/sa_bayesOCR.log");
print FILE "#--------------------------------\n";
print FILE " $timenow\n";
print FILE " $title\n";
print FILE "#--------------------------------\n";
print FILE "$msg\n";
close FILE;
}
#===========================================================================