#!/usr/bin/perl
#                              -*- Mode: Cperl -*-
# main_process.pl ---
# Author           : Manoj Srivastava ( srivasta@glaurung.internal.golden-gryphon.com )
# Created On       : Wed Nov  8 00:26:47 2006
# Created On Node  : glaurung.internal.golden-gryphon.com
# Last Modified By : Manoj Srivastava
# Last Modified On : Tue Dec 19 10:06:36 2006
# Last Machine Used: glaurung.internal.golden-gryphon.com
# Update Count     : 68
# Status           : Unknown, Use with caution!
# HISTORY          :
# Description      :
#
#

#  The idea here is to process a bunch pf mail folders, which are
#  supposed to contain mail which is either all ham or all spam,
#  indicated by the command line arguments. We go looking though every
#  mail, and any mail where either the crm114 or the spamasssin
#  judgement was non what we expected, we strip out mail geltering
#  headers, and then we save the mail, one to a file, and we train the
#  approprite filter.

($main::MYNAME     = $main::0) =~ s|.*/||;
$main::Author      = "Manoj Srivastava";
$main::AuthorMail  = "srivasta\@debian.org";

# The archive directory. Must have subdirs ham and spam, to archive
# messages we act upon
my $outdir = '/backup/classify/Done';
my $home = $ENV{'HOME'} || $ENV{'LOGDIR'} || (getpwuid($<))[7] ||
  die "You're homeless!\n";
my $crmdir = "$home/var/lib/crm114";

=head1 NAME

mail_process - process ham or spam

=cut

use strict;
require 5.002;
use Getopt::Long;
use Carp qw(carp croak);
use Fcntl ':flock';             # import LOCK_* constants

use Mail::Box::Manager;
=head1 SYNOPSIS

 usage: list-grey [options] mail-folder [mail-folder ...]

=cut

=head1 DESCRIPTION

The idea here is to process a bunch pf mail folders, which are
supposed to contain mail which is either all ham or all spam,
indicated by the command line arguments. We go looking though every
mail, and any mail where either the crm114 or the spamasssin judgement
was non what we expected, we strip out mail geltering headers, and
then we save the mail, one to a file, and we train the approprite
filter.


=head2 Normal Usage

The tested use case is one where the mail folders parsed are mbox
folders.

=cut

=head1 OPTIONS

=over 3

=item B<--help> Print out a usage message.

=cut

my $help_opt = '';
my $spam = '';
my $ham  = '';

my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime(time);
$year += 1900;
$mon++;
my $month = sprintf("%02d", $mon);
my $date  = sprintf("%02d", $mday);
my $hr    = sprintf("%02d", $hour);
my $minute= sprintf("%02d", $min);

my $prefix = "${year}.${month}.${date}.${hr}.${minute}";
my $num    = "00000001";

my @Filter_Headers = ('X-Grey', 'X-CRM114-Score', 'X-CRM114-Status',
                      'X-Spam-Status', 'X-SA-Orig', 'X-Spam-Value',
                      'X-SA-Rep', 'X-Scanned-By', 'X-Agent-list',
                      'X-MYLIST', 'X-Filter', 'X-Gnus-Mail-Source',
                      'X-Spam-Score', 'X-RealSpam', 'X-Spam');


my %option_ctl =
  (
   "help"                => \$help_opt,
   "spam"                => \$spam,
   "ham"                 => \$ham,
  );

my $usage = <<"EOUSAGE";
usage: $main::MYNAME [options] mail-folder [mail-folder ...]
    where the options are:
   --help              This message
 $main::Author <$main::AuthorMail>
EOUSAGE


sub lock_seq_number {
  my ($spooldir, $sequence_file_name, $lock_suffix) = @_;
  open (LOCK, ">${sequence_file_name}.${lock_suffix}") ||
    die "Could not open lock file:$!";
  flock(LOCK, LOCK_EX);
}

sub get_seq_number {
  my ($spooldir, $sequence_file_name) = @_;
  my $sequence = "00000000";
  open (SEQ, "${sequence_file_name}") ||
    die "Could not open sequence file:$!";
  while (<SEQ>) {
    chomp;

    s/#.*$//g;
    next if m/^\s*$/;

    m/^[\s]*(\S+)/go;
    $sequence = "" . $1;

    last;
  }
  close (SEQ);
  return $sequence;
}

sub set_seq_number {
  my %params = @_;
  croak ("Required argument Sequence Number not present") unless
    defined $params{'Sequence Number'};
  my $spooldir = $params{'Spool_Dir'};
  my $sequence_file_name = $params{'Sequence File'};

  open (SEQ, ">${sequence_file_name}") ||
    die "Could not open sequence file:$!";
  print SEQ $params{'Sequence Number'}, "\n";
  close (SEQ);
}

sub unlock_seq_number {
  my ($spooldir, $sequence_file_name, $lock_suffix) = @_;
  flock(LOCK, LOCK_UN);
  close (LOCK);
  # unlink "${sequence_file_name}.${lock_suffix}";
}

sub new_sequence_number {
  my ($spooldir, $sequence_file_name, $lock_suffix) = @_;

  my $new_seq  = "00000000";
  my $next_seq = "00000001";

  lock_seq_number($spooldir, $sequence_file_name, $lock_suffix);

  if (-f "$sequence_file_name") {
    $new_seq = get_seq_number($spooldir, $sequence_file_name);
    $next_seq = ++$new_seq;
  }
  set_seq_number('Sequence Number' => $next_seq,
                 'Spool_Dir' => $spooldir,
                 'Sequence File' => $sequence_file_name);
  unlock_seq_number($spooldir, $sequence_file_name, $lock_suffix);
  return $new_seq;
}



sub main() {
  my $ret;
  my $type    = '';
  my $crmtype = '';
  my $seq_type= '';

  $ret = GetOptions(%option_ctl);
  if(!$ret) {
    print "use --help to display command line syntax help.\n" ;
    exit 1;
  }
  if ($help_opt){
    print "$usage";
    exit 0;
  }

  if (!($spam || $ham)) {
    die "At least one of ham or spam must be specified\n";
  }
  if ($spam && $ham) {
    die "Only one of ham or spam must be specified\n";
  }
  $type    = "ham"          if $ham;
  $type    = "spam"         if $spam;
  $crmtype = "good"         if $ham; # nonspam, is using mailfilter
  $crmtype = "spam"         if $spam;
  $seq_type= "Ham_Seq_Num"  if $ham;
  $seq_type= "Spam_Seq_Num" if $spam;
  #  Get a new mail manager
  my $mgr     = Mail::Box::Manager->new;

  # Process folders given on the command line
  for my $name (@ARGV) {
    my $folder = $mgr->open(folder => "$name");
    warn "Examining folder '$folder' of type $type\n";
    # Iterate over the messages.
    foreach my $msg ($folder->messages) {
      my ($sa, $crm) = (undef,undef);
      my $head = $msg->head;
      my $save_mail = 0;
      my $train_sa  = 0;
      my $train_crm = 0;

      my $spam_values =  $head->get('X-SA-Orig');
      $spam_values    =~ s/\s+//g             if $spam_values;
      ($sa, $crm) = split (",", $spam_values) if $spam_values;
      undef $spam_values;
      # my $spam_score  =  $head->get('X-Spam-Value');
      # $spam_score     =~ s/\s+//g if $spam_score;
      my $crm_status  = $head->get('X-CRM114-Status');
      if ($crm_status =~ m/UNSURE/ig) {
        warn "TRAIN CRM\n";
          $train_crm++;
          $save_mail++;
      }

      if ($spam) {
        if (($crm && $crm < 0) || (! defined $crm)) {
          warn "TRAIN CRM\n";
          $train_crm++;
          $save_mail++;
        }
        if (($sa && $sa < 0) || (! defined $sa) ) {
          warn "TRAIN SA\n";
          $train_sa++;
          $save_mail++;
        }
      }
      else {
        if (($crm && $crm > 0) || (! defined $crm)) {
          warn "TRAIN CRM\n";
          $train_crm++;
          $save_mail++;
        }
        if (($sa && $sa > 0) || (! defined $sa) ) {
          warn "TRAIN SA\n";
          $train_sa++;
          $save_mail++;
        }
      }
      next unless $save_mail;


      foreach my $del_header (@Filter_Headers) {
        $head->delete($del_header);
      }
      my $sequence_number;
      $sequence_number = &new_sequence_number("$outdir/$type",
                                              "$outdir/$seq_type", 'LCK');
      my $outfile = "$outdir/$type/msg.${sequence_number}______${prefix}";
      warn "Outfile: $outfile\n";
      if ($save_mail) {
        my $outbox = $mgr->open(access => 'rw', create => 1,
                                folder => "$outfile",
                                type => 'mbox');
        $msg->copyTo($outbox,share => 1);
        $outbox->close;
      }
      if ($train_sa) {
        my @args = ("sa-learn", "--${type}", "--mbox", "$outfile");
        system(@args) == 0 or
          die "Could not open a pipe to sa-learn: $!";
      }
      # let mailreaver decide whether or not to train
      system("crm -u $crmdir mailreaver.crm --${crmtype} < $outfile | " .
             "egrep 'file reaver_cache|^X-CRM114-Action'")
        == 0 or die "Could not open a pipe to mailreaver.crm: $!";
#         my $body = "command doorbhash $crmtype\n\n";
#         $body   .= $msg->string . "\n";

#         my $crm_msg =
#           Mail::Message->build(
#                                From    => "Manoj Srivastava <srivasta>",
#                                To      => "testcrm",
#                                Subject => "Train CRM for $crmtype",
#                                data    => $body
#                               );
#         $crm_msg->send(via => 'sendmail');
    }
    $folder->close(write => 'NEVER');
  }
	#my @args = ("sa-learn", "--sync");
	#system(@args) == 0 or die "Could not open a pipe to sa-learn: $!";
  exit 0;
}

## Now just call main
&main();

=head1 RETURN VALUE

The program returns 0 on succesful completion, and non-zero exit
values otherwise.

=cut

=head1 ERRORS

=cut

=head1 DIAGNOSTICS

=cut

=head1 EXAMPLES

=cut

=head1 ENVIRONMENT

=cut


=head1 FILES

=cut

=head1 CAVEATS

=cut

=head1 BUGS

None known so far.

=cut

=head1 RESTRICTIONS

The input folders are meant to be mbox folders

=cut

=head1 NOTES

=cut

=head1 SEE ALSO

=cut

=head1  COPYRIGHT AND LICENSE

Copyright 2006 by Manoj Srivastava.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

=cut

=head1 AUTHOR

Manoj Srivastava C<srivasta\@debian.org>

=cut

exit 0;
__END__