#************************************************************************* # Bayes OCR Plugin, version 0.1 #************************************************************************* # Copyright 2007 P.R.A. Group - D.I.E.E. - University of Cagliari (ITA) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #************************************************************************* package BayesOCR_PLG; use strict; use Mail::SpamAssassin; use Mail::SpamAssassin::Util; use Mail::SpamAssassin::Plugin; use Mail::SpamAssassin::Logger; our @ISA = qw (Mail::SpamAssassin::Plugin); # constructor: register the eval rule sub new { my ( $class, $mailsa ) = @_; $class = ref($class) || $class; my $self = $class->SUPER::new($mailsa); bless( $self, $class ); dbg("PLG-BayesOCR:: new:: register_eval_rule"); $self->register_eval_rule("BayesOCR_check"); $self->{'imgTxt_classifierOut'} = -1; $self->{'imgTxt_tagmsg'} = ""; #msg to be saved in e-mail tag when $self->{'imgTxt_classifierOut'} <= 0 return $self; } #=========================================================================== #=========================================================================== sub check_start{ # Called before eval rule my ( $self, $pms ) = @_; dbg("PLG-BayesOCR:: check_start:: init score"); #Init outNB_imgTxt $self->{'imgTxt_classifierOut'} = -1; $self->{'imgTxt_tagmsg'} = ""; } sub isValidUser{ my ($pms) = @_; my $username = $pms->{main}->{username}; dbg("PLG-BayesOCR:: isValidUser:: Username: $username"); return 1; } sub BayesOCR_check { # BayesOCR_check(thr) # Return an hit when (outNB > thr) # The score is computed as (weigth * outNB) # my ($self, $pms, $unused, $thrL, $thrH) = @_; my $plgRuleName = $pms->get_current_eval_rule_name(); #if( isValidUser($pms) == 0) { return 0; } dbg("PLG-BayesOCR:: BayesOCR_check :: Rule: $plgRuleName"); dbg("PLG-BayesOCR:: BayesOCR_check :: thr: ($thrH, $thrL)"); if($self->{'imgTxt_classifierOut'} < 0) { #Output if( $self->imageSpam_OCRTextProcessing($pms ) ) { $self->{'imgTxt_tagmsg'} = $self->{'imgTxt_classifierOut'}; } dbg("PLG-BayesOCR:: BayesOCR_check:: Write Mail Header\n\n"); $pms->set_tag ("PLGBAYESOCROUT", $self->{'imgTxt_tagmsg'} ); } my $resHit = ($self->{'imgTxt_classifierOut'} > $thrL) && ($self->{'imgTxt_classifierOut'} <= $thrH ); return $resHit; } 1; #=========================================================================== sub imageSpam_OCRTextProcessing # boolen $self->imageSpam_OCRTextProcessing($pms) # # imageSpam processing by image's text analisys with SA's NaiveBayes # return 1 : (sucess) image's text has beeen extract and processed by NB # return 0 : (failed) no images, no text, no NB. { my ( $self, $pms ) = @_; # $self :: Obj Plugin # $pms :: Obj Mail::SpamAssassin::PerMsgStatus # $pms->{msg} :: message of class Mail::SpamAssassin::Message #================================ # Init result #================================ $self->{'imgTxt_classifierOut'} = 0; #================================ # Check & Create Classifier #================================ my $nbSA = $pms->{main}->{bayes_scanner}; #my $nbSA = new Mail::SpamAssassin::Bayes ($pms->{main}); if( $nbSA->is_scan_available() == 0) { dbg("PLG-BayesOCR:: imageTextClassifierOutEstimation: NB scan not available"); $self->{'imgTxt_tagmsg'} = "0.0 (NaiveBayes not available)"; return 0; } #================================ # Image extraction #================================ dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: Check for Attached Images"); my ($imgTextOcr, $numImages) = imageTextExtractionFromMSG($pms->{msg}); if($numImages == 0) { $self->{'imgTxt_tagmsg'} = "0.0 (No images found)"; return 0; } # Check extracted text my $numWord = 0; while($imgTextOcr =~ /[a-z]{3,}/gi) { $numWord++; } dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: $numWord words (3+ chars) recognised"); if($numWord <= 3) { $self->{'imgTxt_tagmsg'} = "0.0 (No usefull text found)"; return 0; } #================================ # Classifier's output estimation #================================ # creation of msg with image's text my $mailraw = createMSGFromText($pms, $imgTextOcr); my $msgTmp = $pms->{main}->parse($mailraw,1); dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: Compute score with trained NaiveBayes"); my $pmsTMP = new Mail::SpamAssassin::PerMsgStatus($pms->{main}, $msgTmp); # Classification my $outNB = $nbSA->scan($pmsTMP, $msgTmp); $self->{'imgTxt_classifierOut'} = sprintf("%0.3f", $outNB); dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: classifier's out = $self->{'imgTxt_classifierOut'}" ); return 1; # All OK } #=========================================================================== sub imageTextExtractionFromMSG # ($imgTextOcr, $numImages) = imageTextExtractionFromMSG($msg) # Extract the text from all attached images # Return all text anche the number of attached images { my $msg = $_[0]; dbg("PLG-BayesOCR:: imageTextExtractionFromMSG:: Extract & Convert Images"); my @mimeStr = ("image/*", "img/*"); my @tmpImgFile; my $num=0; my $imgTextOcr = ""; foreach (@mimeStr) { # Search all attach with current MIME my @img_parts = $msg->find_parts($_); for (my $i=0; $i <= $#img_parts; $i++) { my $imagestream = $img_parts[$i]->decode(1048000); # ~ 1 MB $imgTextOcr = join $imgTextOcr, imageTextExtractionByOCR($imagestream), "\n"; $num++; } } dbg("PLG-BayesOCR:: imageTextExtractionFromMSG:: $num images extracted"); return ($imgTextOcr, $num); } #=========================================================================== sub imageTextExtractionByOCR # $textOut = imageTextExtractionByOCR( $imagestream ) # Text extraction from imge file "" by OCR engine { my $imagestream = $_[0]; my $imagelen = length($imagestream) / 1024; my $tmpDir = "/tmp"; #Get tmp dir my $tmpFile = "$tmpDir/sa_bayesOCR_tmpImg.$$"; # Zooming small images could improve OCR accuracy # Byte Check # > 1000K => no OCR # < 15K => OCR + zoom 4X # else => Check resolution # Check resolution # res > 1400x1050 => no OCR # 1024x768 <= res < 1400x1050 => OCR (no zoom) # 800x600 <= res < 1024x768 => OCR + zoom 2X # res < 800x600 => OCR + zoom 4X if ($imagelen > 1000) { dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Skip, image size = $imagelen"); return ""; } open (FILE, ">$tmpFile.tmp") or return ""; print FILE "$imagestream \n"; close FILE; my $convertOPT = ""; my $imageIdentifyTxt = ""; if($imagelen < 20 ) { dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 4X"); $convertOPT = "-sample 400% -density 280"; } else { dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Check image dim"); # check WxH open EXEFH, "identify -quiet -ping $tmpFile.tmp |"; $imageIdentifyTxt = join "", ; close EXEFH; if( $imageIdentifyTxt =~ s/\s(\d*)x(\d*)\s//i ) { my $size1 = $1; my $size2 = $2; if($size1 * $size2 > 1400*1050 && $size1 > 1280 && $size2 > 1024) { dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Skip, image dim = $size1 x $size2"); unlink "$tmpFile.tmp"; return ""; } if( $size1 * $size2 < 800*600) { dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 4X"); $convertOPT = "-sample 400% -density 280"; } elsif( $size1 * $size2 < 1024*768) { dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 2X"); $convertOPT = "-sample 200% -density 280"; } } } dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Convert & OCR"); # -append :: concatenate image i layers # -flatten :: fuse layers # -density :: set dpi my $exstatus = system("convert $tmpFile.tmp -append -flatten $convertOPT $tmpFile.pnm"); if($exstatus != 0) { dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Convert ERROR!!"); #Catturo SDOUT e STERR open EXEFH, "identify -verbose -strip $tmpFile.tmp 2>&1 |"; $imageIdentifyTxt = join "", ; close EXEFH; my $msg = "Stream size (kb): $imagelen\nIdentify output: \n$imageIdentifyTxt\n"; saveLogMsg($tmpDir, "Convert Error", $msg); unlink "$tmpFile.tmp"; return ""; } # GOCR call with timeout (thanks to B. Austin for the usefull suggestions) my $textOut = ""; eval { local $SIG{ALRM} = sub { die "GOCR_TIMEOUT\n" }; alarm 10; # Retrieve gocr output open EXEFH, "gocr $tmpFile.pnm |"; $textOut = join "", ; close EXEFH; alarm 0; }; if ($@) { die unless $@ eq "GOCR_TIMEOUT\n"; # propagate unexpected errors # timed out dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: OCR timeout!!"); # Extract the list of all child of this process open PSFH, "ps -o pid,cmd --ppid $$ |"; my $psOut = join "", ; close PSFH; #Get the PID of gocr child if( $psOut =~ s/(\d*) gocr//i) { kill 9, $1; } my $msg = "Stream size (kb): $imagelen\nPS out:\n $psOut\n"; saveLogMsg($tmpDir, "OCR timeout", $msg); $textOut = ""; } unlink "$tmpFile.tmp"; unlink "$tmpFile.pnm"; return $textOut; } #=========================================================================== sub createMSGFromText # msg = createMSGFromText(@img_ocrText) { my ($pms, $ocrText) = @_; dbg("PLG-BayesOCR: createMSGFromText:: Make temp email with OCR's text"); my $subject = ""; my $date = $pms->{msg}->get_pristine_header("Date"); my $from = ""; #$pms->{msg}->get_pristine_header("From"); my $to = ""; #$pms->{msg}->get_pristine_header("To"); my $mailraw = "From: $from\nTo: $to\nSubject: $subject\nDate: $date\nContent-Type: text/plain;\n charset=\"us-ascii\"\nContent-Disposition: inline\n\n$ocrText\n"; return $mailraw } #=========================================================================== #=========================================================================== sub saveLogMsg() { my ($tmpDir, $title, $msg) = @_; my $timenow = localtime time; open (FILE, ">>$tmpDir/sa_bayesOCR.log"); print FILE "#--------------------------------\n"; print FILE " $timenow\n"; print FILE " $title\n"; print FILE "#--------------------------------\n"; print FILE "$msg\n"; close FILE; } #===========================================================================