#! /bin/zsh -f
#                               -*- Mode: Sh -*-
# init-train.sh ---
# Author           : Manoj Srivastava ( srivasta@anzu.internal.golden-gryphon.com )
# Created On       : Fri Nov 23 13:00:06 2007
# Created On Node  : anzu.internal.golden-gryphon.com
# Last Modified By : Manoj Srivastava
# Last Modified On : Sat Nov 24 23:22:56 2007
# Last Machine Used: anzu.internal.golden-gryphon.com
# Update Count     : 28
# Status           : Unknown, Use with caution!
# HISTORY          :
# Description      :
#
#


# Stop at the first command that fails instead of carrying on.
set -o errexit

# Root of the classified corpus; the Spam and Ham directories live below it.
Corpus_Top='/backup/classify/Done'
setq() {
    # setq VARIABLE VALUE DESCRIPTION
    #
    # Assign VALUE to the global variable named VARIABLE.  An empty VALUE
    # is fatal: a diagnostic naming DESCRIPTION goes to stderr and the
    # shell exits with status 1.  When the global Verbose is non-empty the
    # assignment is also reported on stdout.
    if [ -z "$2" ]; then
        echo >&2 "$progname: Unable to determine $3"
        exit 1
    fi

    [ -z "$Verbose" ] || echo "$progname: $3 is $2"

    # Only the variable name is expanded before eval runs; the value is
    # still referenced as "$2", so its contents are never re-parsed.
    eval "${1}=\"\$2\""
}

withecho () {
        # withecho COMMAND [ARG...]
        #
        # Trace COMMAND and its arguments on stderr (one line, with a
        # leading space, arguments separated by single spaces), then run it
        # and return its exit status.
        #
        # printf is used rather than echo so the arguments are shown
        # verbatim: zsh's echo would interpret backslash escapes in them.
        {
                printf ' %s' "$@"
                printf '\n'
        } >&2
        "$@"
}

usageversion () {
        # Print the version banner and the option summary on stderr.
        # Reads the globals progname and pversion.
        #
        # The option list mirrors what the command line parser acts on:
        # -c (chunk size), -h (help) and -s (starting round).
        cat >&2 <<END
Debian GNU/Linux $progname $pversion.
           Copyright (C) 2007 Manoj Srivastava.
This is free software; see the Artistic Licence for copying
conditions.  There is NO warranty.

Usage: $progname  [options]
Options:
  -h           print this message
  -c N         add N messages of each kind per training round (default 100)
  -s N         start with round N, counting from 0 (default 0)
END
}

#
# Global settings used by the rest of the script.
#
# progname is interpolated into the usage text and the diagnostics; it is
# taken from the environment when set there, otherwise from the name the
# script was invoked as.
#
: "${progname:=${0##*/}}"
action='withecho'

# Both corpus directories must exist.  The exit status of "ls | wc -l" is
# that of wc, so a missing directory would otherwise be counted as an
# empty one and go unnoticed.
for corpus_dir in "$Corpus_Top/Spam" "$Corpus_Top/Ham"; do
    if [ ! -d "$corpus_dir" ]; then
        echo >&2 "$progname: corpus directory $corpus_dir not found"
        exit 1
    fi
done

# Number of messages available in each class, and in total.
num_spam=$(ls -1 "$Corpus_Top/Spam" | wc -l)
num_good=$(ls -1 "$Corpus_Top/Ham"  | wc -l)
count=$(( num_spam + num_good ))

# Default number of messages of each class added per training round
# (overridden by -c).
chunk=100

here=$(pwd)

# Working copies of the corpus are collected in ./Ham and ./Spam.
mkdir -p Ham Spam

# Default first round (overridden by -s).
start=0

# Command line
#
# parse_args ARG...
#
# Parse the script's options and update the globals they control:
#   -c N        chunk  (must be a positive integer; it is used as a divisor)
#   -s N        start  (must be a non-negative integer)
#   -h, --help  print the usage text and exit 0
# Anything left after the options is stored in the array remaining_args.
# Invalid input prints a diagnostic on stderr and exits 1.
#
# NOTE(review): getopt reports its own errors under the name 're-learn',
# and the long option "percent" is declared but has no handler, so it
# still ends in the "Internal error" arm -- both look like leftovers from
# a sibling script; confirm before changing.
parse_args () {
    # Test getopt's own status here: once "eval set" has run, $? no longer
    # says anything about getopt.
    TEMP=$(getopt -a -s bash -o c:hs: --long help,percent -n 're-learn' -- "$@") || {
        echo "Terminating..." >&2
        exit 1
    }
    # Note the quotes around `$TEMP': they are essential!
    eval set -- "$TEMP"
    while true ; do
        case "$1" in
            -c)
                case "$2" in
                    ''|*[!0-9]*)
                        echo >&2 "Invalid chunk size '$2': expected a positive integer"
                        exit 1 ;;
                esac
                if [ "$2" -le 0 ]; then
                    echo >&2 "Invalid chunk size '$2': expected a positive integer"
                    exit 1
                fi
                chunk="$2"
                shift 2 ;;
            -h|--help)
                usageversion
                exit 0 ;;
            -s)
                case "$2" in
                    ''|*[!0-9]*)
                        echo >&2 "Invalid start '$2': expected a non-negative integer"
                        exit 1 ;;
                esac
                start="$2"
                shift 2 ;;
            --)
                shift
                break ;;
            '')
                break ;;
            *)
                echo >&2 "Internal error!($1)"
                usageversion
                exit 1 ;;
        esac
    done
    remaining_args=("$@")
}

parse_args "$@"
# Leave only the non-option arguments in the positional parameters.
set -- "${remaining_args[@]}"

# The trainer is invoked as ./mailtrainer.crm, so it has to be an
# executable in the current directory.
[ -x mailtrainer.crm ] || {
    echo >&2 "Could not find mailtrainer"
    exit 2
}

# The larger of the two classes decides how many rounds are needed.
max=$num_good
if [ $num_spam -gt $num_good ]; then
    max=$num_spam
fi

# Index of the last round (integer division, so a partial chunk at the
# end gets a round of its own).
maxiter=$(( $max / $chunk ))

if [ -f spam.css ]; then
    # Statistics files already exist: keep a copy of each before going on.
    cp -f spam.css spam.css.save
    if [ -f nonspam.css ]; then
        cp -f nonspam.css nonspam.css.save
    fi
    cssutil -b -r spam.css
    cssutil -b -r nonspam.css
else
    #  For the very first run, there should be no validation -- we need
    #  all the data points to initialize our css files
    #  (presumably "-S 4194000" makes cssutil create the files with that
    #  size -- confirm against the cssutil documentation).
    cssutil -b -r -S 4194000  spam.css
    cssutil -b -r -S 4194000  nonspam.css
    percent='zero'
    v=''
fi

# Sequence numbers given to the next Ham (h) and Spam (s) message copied
# into the working directories.
h=$(( start * chunk + 1 ))
s=$(( start * chunk + 1 ))

# chunk_count LIMIT CHUNK TOTAL
#
# Print how many messages of the chunk covering positions
# (LIMIT - CHUNK + 1) .. LIMIT exist in a class holding TOTAL messages:
# CHUNK for a complete chunk, the remainder for the last, partial chunk,
# and 0 once the chunk lies entirely past the end of the class.
chunk_count () {
    if [ "$1" -le "$3" ]; then
        echo "$2"
    elif [ $(( $1 - $2 )) -lt "$3" ]; then
        echo $(( $3 - ( $1 - $2 ) ))
    else
        echo 0
    fi
}

# copy_chunk KIND TOTAL INDEX
#
# Copy the current chunk of class KIND (Ham or Spam), which holds TOTAL
# messages, from $Corpus_Top/KIND into ./KIND.  Messages are taken oldest
# first (ls -tr) and named msg.NNNNNNNN______<name>, numbered from INDEX;
# an existing "msg.<8 digits>______" prefix on <name> is dropped first.
# Reads the globals limit, chunk and Corpus_Top and leaves the number of
# messages copied in the global "copied".
copy_chunk () {
    local kind=$1 total=$2 index=$3
    local wanted file
    copied=0
    wanted=$(chunk_count "$limit" "$chunk" "$total")
    [ "$wanted" -gt 0 ] || return 0
    # head keeps everything up to the end of this chunk (or of the class,
    # if that comes first); tail then keeps just this chunk's messages.
    # Names are read one per line so that blanks in a name do not split it.
    while IFS= read -r file; do
        cp -a -- "$Corpus_Top/$kind/$file" \
            "$kind/msg.$(printf '%08d' "$index")______${file##msg.[[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]]______}"
        index=$(( index + 1 ))
        copied=$(( copied + 1 ))
    done < <(ls -1tr "$Corpus_Top/$kind/" | head -n "$limit" | tail -n "$wanted")
    return 0
}

# One round per chunk: add the next chunk of each class to the working
# directories, train on everything collected so far, then run a
# validation pass and log its summary.
for iteration in $(seq "$start" "$maxiter"); do
    limit=$(( chunk * ( iteration + 1 ) ))
    if [[ $limit -gt $(( max + chunk )) ]]; then
        break
    fi
    echo      "Messages $(( limit - chunk + 1 )) --- $limit." >> junk.log
    echo >&2  "Messages $(( limit - chunk + 1 )) --- $limit."
    # Copy Ham
    copy_chunk Ham "$num_good" "$h"
    h=$(( h + copied ))
    # Copy Spam
    copy_chunk Spam "$num_spam" "$s"
    s=$(( s + copied ))
    # Training pass; perl keeps only the summary lines of the output.
    ./mailtrainer.crm  --spam="$here/Spam/" --good="$here/Ham/" --repeat=100 --streak=$(( 1 + 2 * limit )) | perl -nle 'm/^\s+\S+|^Override|^\.CSS|^Running|\s+train|^Excel|^Finish|accuracy:/ && print'
    # Validation pass; the pattern is handed to mailtrainer literally.
    v="--validate=[_][_][_][_]"
    ./mailtrainer.crm  --spam="$here/Spam/" "$v" --repeat=1 --good="$here/Ham/" | perl -nle 'm/^\s+\S+|^Override|^\.CSS|^Running|\s+train|^Excel|^Finish|accuracy:/ && print' | tee junk1.log
    cat junk1.log >> junk.log
    echo -n Date: >> junk.log
    date --utc    >> junk.log
done




 # |  egrep -i '^( +|Finishing|.*train|Excell|Running)'