#! /bin/zsh -f
#                               -*- Mode: Sh -*-
# train.sh ---
# Author           : Manoj Srivastava ( srivasta@glaurung.internal.golden-gryphon.com )
# Created On       : Wed Nov 22 12:33:55 2006
# Created On Node  : glaurung.internal.golden-gryphon.com
# Last Modified By : Manoj Srivastava
# Last Modified On : Thu Nov 23 01:36:07 2006
# Last Machine Used: glaurung.internal.golden-gryphon.com
# Update Count     : 44
# Status           : Unknown, Use with caution!
# HISTORY          :
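# Description      : Train the CRM114 spam.css/nonspam.css files with
#                    mailtrainer.crm in several passes, each pass using a
#                    larger slice of the newest Spam and Ham corpus messages.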
#
#

# Abort the whole run as soon as any command exits non-zero.
set -o errexit

# Root of the classified mail corpus; its Spam/ and Ham/ subdirectories
# are the source of the training messages.
Corpus_Top='/backup/classify/Done'

# First and last pass number, handed to seq(1) by the training loop.
startiter=1
maxiter=6

# Messages taken from each class on the first pass; the training loop
# doubles this after every pass.
chunk=500

# withecho CMD [ARG...]
#   Write the command line, preceded by a single space, to stderr and
#   then execute it.  The function's exit status is that of CMD.
withecho() {
    echo >&2 " $@"
    "$@"
}

# Command prefix used by the training loop for the steps that should be
# echoed before they run.
action=withecho


# Working directories (relative to the current directory) that receive
# the per-pass copies of the corpus messages.
test -d Ham || mkdir Ham
test -d Spam || mkdir Spam

# Absolute path of the working directory, passed to the trainer later on.
here=$(pwd)

# Fail early when a corpus directory is missing.  The counting pipelines
# below end in wc, so `set -e' never sees a failing ls; without this
# check a missing directory would silently yield a count of 0.
for corpus_dir in "$Corpus_Top/Spam" "$Corpus_Top/Ham"; do
    if [ ! -d "$corpus_dir" ]; then
        echo "$0: corpus directory $corpus_dir not found" >&2
        exit 1
    fi
done

# Number of entries in each class.  The arithmetic expansion strips any
# padding that wc puts around the number.
num_spam=$(( $(ls -1 "$Corpus_Top/Spam" | wc -l) ))
num_good=$(( $(ls -1 "$Corpus_Top/Ham"  | wc -l) ))

# low_max is the size of the smaller class; the training loop never
# takes more than this many messages from either class.
if [ "$num_spam" -lt "$num_good" ]; then
    low_max=$num_spam
else
    low_max=$num_good
fi

# Training passes.  Each pass copies the `limit' most recently modified
# messages of each class into ./Spam and ./Ham, runs the CRM114 mail
# trainer on them, snapshots the resulting .css files under the pass
# number, and then doubles `limit' for the next pass.
#
# NOTE(review): limit always starts at $chunk, even when startiter is
# greater than 1 -- confirm whether a resumed run should instead start
# from the slice size the skipped passes would have reached.
limit=$chunk
for iteration in $(seq "$startiter" "$maxiter"); do
    # Never ask for more messages than the smaller class can supply.
    if [ "$limit" -gt "$low_max" ]; then
        limit=$low_max
    fi

    for type in Spam Ham; do
        # Start every pass from an empty working directory.
        test ! -d "$type" || rm -rf "$type"
        mkdir "$type"
        k=1
        # Read the listing one line at a time instead of expanding it in
        # a `for' list: an unquoted command substitution is split on
        # every IFS character, which breaks file names containing blanks.
        ls -1tr "$Corpus_Top/$type/" | tail -n "$limit" |
            while IFS= read -r i; do
                # Prefix a zero-padded sequence number so the copies keep
                # the oldest-to-newest order when sorted by name.
                cp "$Corpus_Top/$type/$i" \
                    "$type/msg.$(printf '%08d' "$k")______$i"
                k=$(( k + 1 ))
            done
    done

    # $action is deliberately left unquoted so that an empty value
    # simply disappears from the command line.
    $action /usr/share/crm114/mailtrainer.crm "--spam=$here/Spam/" \
        "--good=$here/Ham/" --repeat=100 "--streak=$(( 10 + 2 * limit ))"
    # Keep a per-pass snapshot of both statistics files.
    $action cp --sparse=always spam.css "spam.css.$iteration"
    $action cp --sparse=always nonspam.css "nonspam.css.$iteration"
    cssdiff spam.css nonspam.css
    echo "Done with $iteration runs."
    limit=$(( 2 * limit ))
    #sleep 10
    #echo -n "Done with $iteration runs (chunk = $limit). Continue? "
    #read ans
done