401 lines
12 KiB
Perl
401 lines
12 KiB
Perl
#*************************************************************************
|
|
# Bayes OCR Plugin, version 0.1
|
|
#*************************************************************************
|
|
# Copyright 2007 P.R.A. Group - D.I.E.E. - University of Cagliari (ITA)
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#*************************************************************************
|
|
|
|
package BayesOCR_PLG;
|
|
|
|
use strict;
|
|
use Mail::SpamAssassin;
|
|
use Mail::SpamAssassin::Util;
|
|
use Mail::SpamAssassin::Plugin;
|
|
use Mail::SpamAssassin::Logger;
|
|
|
|
our @ISA = qw (Mail::SpamAssassin::Plugin);
|
|
|
|
# constructor: register the eval rule
|
|
sub new {
|
|
my ( $class, $mailsa ) = @_;
|
|
$class = ref($class) || $class;
|
|
my $self = $class->SUPER::new($mailsa);
|
|
bless( $self, $class );
|
|
dbg("PLG-BayesOCR:: new:: register_eval_rule");
|
|
|
|
$self->register_eval_rule("BayesOCR_check");
|
|
$self->{'imgTxt_classifierOut'} = -1;
|
|
$self->{'imgTxt_tagmsg'} = ""; #msg to be saved in e-mail tag when $self->{'imgTxt_classifierOut'} <= 0
|
|
|
|
return $self;
|
|
}
|
|
|
|
#===========================================================================
|
|
#===========================================================================
|
|
|
|
sub check_start{
|
|
# Called before eval rule
|
|
my ( $self, $pms ) = @_;
|
|
dbg("PLG-BayesOCR:: check_start:: init score");
|
|
|
|
#Init outNB_imgTxt
|
|
$self->{'imgTxt_classifierOut'} = -1;
|
|
$self->{'imgTxt_tagmsg'} = "";
|
|
}
|
|
|
|
sub isValidUser{
|
|
my ($pms) = @_;
|
|
my $username = $pms->{main}->{username};
|
|
dbg("PLG-BayesOCR:: isValidUser:: Username: $username");
|
|
|
|
return 1;
|
|
}
|
|
|
|
|
|
sub BayesOCR_check {
|
|
# BayesOCR_check(thr)
|
|
# Return an hit when (outNB > thr)
|
|
# The score is computed as (weigth * outNB)
|
|
#
|
|
my ($self, $pms, $unused, $thrL, $thrH) = @_;
|
|
my $plgRuleName = $pms->get_current_eval_rule_name();
|
|
|
|
#if( isValidUser($pms) == 0) { return 0; }
|
|
|
|
dbg("PLG-BayesOCR:: BayesOCR_check :: Rule: $plgRuleName");
|
|
dbg("PLG-BayesOCR:: BayesOCR_check :: thr: ($thrH, $thrL)");
|
|
|
|
|
|
if($self->{'imgTxt_classifierOut'} < 0)
|
|
{
|
|
#Output
|
|
if( $self->imageSpam_OCRTextProcessing($pms ) )
|
|
{
|
|
$self->{'imgTxt_tagmsg'} = $self->{'imgTxt_classifierOut'};
|
|
}
|
|
|
|
dbg("PLG-BayesOCR:: BayesOCR_check:: Write Mail Header\n\n");
|
|
$pms->set_tag ("PLGBAYESOCROUT", $self->{'imgTxt_tagmsg'} );
|
|
}
|
|
my $resHit = ($self->{'imgTxt_classifierOut'} > $thrL) && ($self->{'imgTxt_classifierOut'} <= $thrH );
|
|
|
|
return $resHit;
|
|
}
|
|
|
|
1;
|
|
|
|
#===========================================================================
|
|
|
|
sub imageSpam_OCRTextProcessing
|
|
# boolen $self->imageSpam_OCRTextProcessing($pms)
|
|
#
|
|
# imageSpam processing by image's text analisys with SA's NaiveBayes
|
|
# return 1 : (sucess) image's text has beeen extract and processed by NB
|
|
# return 0 : (failed) no images, no text, no NB.
|
|
{
|
|
my ( $self, $pms ) = @_;
|
|
# $self :: Obj Plugin
|
|
# $pms :: Obj Mail::SpamAssassin::PerMsgStatus
|
|
# $pms->{msg} :: message of class Mail::SpamAssassin::Message
|
|
|
|
#================================
|
|
# Init result
|
|
#================================
|
|
$self->{'imgTxt_classifierOut'} = 0;
|
|
|
|
#================================
|
|
# Check & Create Classifier
|
|
#================================
|
|
|
|
my $nbSA = $pms->{main}->{bayes_scanner};
|
|
#my $nbSA = new Mail::SpamAssassin::Bayes ($pms->{main});
|
|
|
|
if( $nbSA->is_scan_available() == 0)
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextClassifierOutEstimation: NB scan not available");
|
|
$self->{'imgTxt_tagmsg'} = "0.0 (NaiveBayes not available)";
|
|
return 0;
|
|
}
|
|
|
|
#================================
|
|
# Image extraction
|
|
#================================
|
|
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: Check for Attached Images");
|
|
my ($imgTextOcr, $numImages) = imageTextExtractionFromMSG($pms->{msg});
|
|
|
|
if($numImages == 0)
|
|
{
|
|
$self->{'imgTxt_tagmsg'} = "0.0 (No images found)";
|
|
return 0;
|
|
}
|
|
|
|
# Check extracted text
|
|
my $numWord = 0;
|
|
while($imgTextOcr =~ /[a-z]{3,}/gi)
|
|
{
|
|
$numWord++;
|
|
}
|
|
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: $numWord words (3+ chars) recognised");
|
|
|
|
|
|
if($numWord <= 3)
|
|
{
|
|
$self->{'imgTxt_tagmsg'} = "0.0 (No usefull text found)";
|
|
return 0;
|
|
}
|
|
|
|
#================================
|
|
# Classifier's output estimation
|
|
#================================
|
|
|
|
# creation of msg with image's text
|
|
my $mailraw = createMSGFromText($pms, $imgTextOcr);
|
|
my $msgTmp = $pms->{main}->parse($mailraw,1);
|
|
|
|
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: Compute score with trained NaiveBayes");
|
|
my $pmsTMP = new Mail::SpamAssassin::PerMsgStatus($pms->{main}, $msgTmp);
|
|
|
|
# Classification
|
|
my $outNB = $nbSA->scan($pmsTMP, $msgTmp);
|
|
$self->{'imgTxt_classifierOut'} = sprintf("%0.3f", $outNB);
|
|
|
|
dbg("PLG-BayesOCR:: imageSpam_OCRTextProcessing:: classifier's out = $self->{'imgTxt_classifierOut'}" );
|
|
|
|
return 1; # All OK
|
|
}
|
|
|
|
|
|
#===========================================================================
|
|
|
|
sub imageTextExtractionFromMSG
|
|
# ($imgTextOcr, $numImages) = imageTextExtractionFromMSG($msg)
|
|
# Extract the text from all attached images
|
|
# Return all text anche the number of attached images
|
|
{
|
|
my $msg = $_[0];
|
|
|
|
dbg("PLG-BayesOCR:: imageTextExtractionFromMSG:: Extract & Convert Images");
|
|
|
|
my @mimeStr = ("image/*", "img/*");
|
|
my @tmpImgFile;
|
|
my $num=0;
|
|
|
|
my $imgTextOcr = "";
|
|
|
|
foreach (@mimeStr)
|
|
{
|
|
# Search all attach with current MIME
|
|
my @img_parts = $msg->find_parts($_);
|
|
for (my $i=0; $i <= $#img_parts; $i++)
|
|
{
|
|
my $imagestream = $img_parts[$i]->decode(1048000); # ~ 1 MB
|
|
$imgTextOcr = join $imgTextOcr, imageTextExtractionByOCR($imagestream), "\n";
|
|
|
|
$num++;
|
|
}
|
|
}
|
|
|
|
dbg("PLG-BayesOCR:: imageTextExtractionFromMSG:: $num images extracted");
|
|
return ($imgTextOcr, $num);
|
|
}
|
|
|
|
#===========================================================================
|
|
|
|
sub imageTextExtractionByOCR
|
|
# $textOut = imageTextExtractionByOCR( $imagestream )
|
|
# Text extraction from imge file "" by OCR engine
|
|
{
|
|
my $imagestream = $_[0];
|
|
my $imagelen = length($imagestream) / 1024;
|
|
my $tmpDir = "/tmp"; #Get tmp dir
|
|
my $tmpFile = "$tmpDir/sa_bayesOCR_tmpImg.$$";
|
|
|
|
# Zooming small images could improve OCR accuracy
|
|
|
|
# Byte Check
|
|
# > 1000K => no OCR
|
|
# < 15K => OCR + zoom 4X
|
|
# else => Check resolution
|
|
|
|
# Check resolution
|
|
# res > 1400x1050 => no OCR
|
|
# 1024x768 <= res < 1400x1050 => OCR (no zoom)
|
|
# 800x600 <= res < 1024x768 => OCR + zoom 2X
|
|
# res < 800x600 => OCR + zoom 4X
|
|
|
|
if ($imagelen > 1000)
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Skip, image size = $imagelen");
|
|
return "";
|
|
}
|
|
|
|
open (FILE, ">$tmpFile.tmp") or return "";
|
|
print FILE "$imagestream \n";
|
|
close FILE;
|
|
|
|
my $convertOPT = "";
|
|
my $imageIdentifyTxt = "";
|
|
if($imagelen < 20 )
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 4X");
|
|
$convertOPT = "-sample 400% -density 280";
|
|
}
|
|
else
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Check image dim");
|
|
|
|
# check WxH
|
|
open EXEFH, "identify -quiet -ping $tmpFile.tmp |";
|
|
$imageIdentifyTxt = join "", <EXEFH>;
|
|
close EXEFH;
|
|
|
|
if( $imageIdentifyTxt =~ s/\s(\d*)x(\d*)\s//i )
|
|
{
|
|
my $size1 = $1;
|
|
my $size2 = $2;
|
|
|
|
if($size1 * $size2 > 1400*1050 && $size1 > 1280 && $size2 > 1024)
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Skip, image dim = $size1 x $size2");
|
|
unlink "$tmpFile.tmp";
|
|
return "";
|
|
}
|
|
|
|
if( $size1 * $size2 < 800*600)
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 4X");
|
|
$convertOPT = "-sample 400% -density 280";
|
|
}
|
|
elsif( $size1 * $size2 < 1024*768)
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Enable zoom 2X");
|
|
$convertOPT = "-sample 200% -density 280";
|
|
}
|
|
}
|
|
}
|
|
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Convert & OCR");
|
|
# -append :: concatenate image i layers
|
|
# -flatten :: fuse layers
|
|
# -density :: set dpi
|
|
|
|
my $exstatus = system("convert $tmpFile.tmp -append -flatten $convertOPT $tmpFile.pnm");
|
|
if($exstatus != 0)
|
|
{
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: Convert ERROR!!");
|
|
#Catturo SDOUT e STERR
|
|
open EXEFH, "identify -verbose -strip $tmpFile.tmp 2>&1 |";
|
|
$imageIdentifyTxt = join "", <EXEFH>;
|
|
close EXEFH;
|
|
|
|
my $msg = "Stream size (kb): $imagelen\nIdentify output: \n$imageIdentifyTxt\n";
|
|
saveLogMsg($tmpDir, "Convert Error", $msg);
|
|
|
|
unlink "$tmpFile.tmp";
|
|
return "";
|
|
}
|
|
|
|
# GOCR call with timeout (thanks to B. Austin for the usefull suggestions)
|
|
my $textOut = "";
|
|
eval {
|
|
local $SIG{ALRM} = sub { die "GOCR_TIMEOUT\n" };
|
|
alarm 10;
|
|
|
|
# Retrieve gocr output
|
|
open EXEFH, "gocr $tmpFile.pnm |";
|
|
$textOut = join "", <EXEFH>;
|
|
close EXEFH;
|
|
|
|
alarm 0;
|
|
};
|
|
|
|
if ($@) {
|
|
die unless $@ eq "GOCR_TIMEOUT\n"; # propagate unexpected errors
|
|
|
|
# timed out
|
|
dbg("PLG-BayesOCR:: imageTextExtractionByOCR:: OCR timeout!!");
|
|
|
|
# Extract the list of all child of this process
|
|
open PSFH, "ps -o pid,cmd --ppid $$ |";
|
|
my $psOut = join "", <PSFH>;
|
|
close PSFH;
|
|
|
|
#Get the PID of gocr child
|
|
if( $psOut =~ s/(\d*) gocr//i)
|
|
{
|
|
kill 9, $1;
|
|
}
|
|
|
|
my $msg = "Stream size (kb): $imagelen\nPS out:\n $psOut\n";
|
|
saveLogMsg($tmpDir, "OCR timeout", $msg);
|
|
|
|
$textOut = "";
|
|
}
|
|
|
|
unlink "$tmpFile.tmp";
|
|
unlink "$tmpFile.pnm";
|
|
|
|
return $textOut;
|
|
}
|
|
|
|
#===========================================================================
|
|
|
|
sub createMSGFromText
|
|
# msg = createMSGFromText(@img_ocrText)
|
|
{
|
|
my ($pms, $ocrText) = @_;
|
|
dbg("PLG-BayesOCR: createMSGFromText:: Make temp email with OCR's text");
|
|
|
|
my $subject = "";
|
|
my $date = $pms->{msg}->get_pristine_header("Date");
|
|
my $from = ""; #$pms->{msg}->get_pristine_header("From");
|
|
my $to = ""; #$pms->{msg}->get_pristine_header("To");
|
|
|
|
|
|
my $mailraw = "From: $from\nTo: $to\nSubject: $subject\nDate: $date\nContent-Type: text/plain;\n charset=\"us-ascii\"\nContent-Disposition: inline\n\n$ocrText\n";
|
|
|
|
return $mailraw
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#===========================================================================
|
|
#===========================================================================
|
|
sub saveLogMsg()
|
|
{
|
|
my ($tmpDir, $title, $msg) = @_;
|
|
my $timenow = localtime time;
|
|
open (FILE, ">>$tmpDir/sa_bayesOCR.log");
|
|
|
|
print FILE "#--------------------------------\n";
|
|
print FILE " $timenow\n";
|
|
print FILE " $title\n";
|
|
print FILE "#--------------------------------\n";
|
|
print FILE "$msg\n";
|
|
|
|
close FILE;
|
|
}
|
|
#===========================================================================
|
|
|
|
|