#! /bin/zsh -f
#                               -*- Mode: Sh -*-
# train.sh ---
# Author           : Manoj Srivastava ( srivasta@glaurung.internal.golden-gryphon.com )
# Created On       : Wed Nov 22 12:33:55 2006
# Created On Node  : glaurung.internal.golden-gryphon.com
# Last Modified By : Manoj Srivastava
# Last Modified On : Sat Nov 25 23:58:46 2006
# Last Machine Used: glaurung.internal.golden-gryphon.com
# Update Count     : 57
# Status           : Unknown, Use with caution!
# HISTORY          :
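# Description      : Train crm114 (mailtrainer.crm) on successive,
#                    half-overlapping windows of `chunk' messages taken
#                    oldest-first from the Spam and Ham directories under
#                    Corpus_Top, snapshotting spam.css / nonspam.css after
#                    each iteration and logging progress to junk.log.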
#
#

# Abort as soon as any command fails.
set -e

# Name shown in the usage message: the script's own file name.  It is
# captured here, at top level, because under zsh's default options $0
# inside a function names the function rather than the script.
progname=${0##*/}

# Directory containing the Spam/ and Ham/ corpora to train from.
Corpus_Top="/backup/classify/Done"
# Number of messages staged for each training run (overridden by -c).
chunk=500
# First iteration to run; iterations are counted from 0 (overridden by -s).
startiter=0

# withecho CMD [ARG...]
# Print the command line on stderr (prefixed by one space), then run it.
# The exit status is that of CMD.
withecho () {
    # printf '%s' emits the arguments verbatim; echo (zsh's in particular)
    # would expand backslash escapes and so misreport what is being run.
    printf ' %s\n' "$*" >&2
    "$@"
}
# Prefix placed in front of the training commands.  The default traces and
# runs them; the -n option replaces it with plain `echo' (print only).
action='withecho'

# usageversion
# Write the option summary to stderr.  The program name comes from
# $progname when the caller has set it and falls back to `train', the
# name that the getopt call in this script reports in its diagnostics.
usageversion () {
    cat >&2 <<END
Usage: ${progname:-train}  [options]
Options:
  -h           print this message
  -c size      Set chunk size. nominally 500
  -n           "Dry-run" mode - No action taken, only print commands.
  -s int       Start at iteration given, rather than 0
  -v           Make the command verbose

END
}

# Parse the command line.
#
# require_uint OPTION VALUE
# Exit with a diagnostic unless VALUE consists solely of decimal digits.
require_uint () {
    case "$2" in
        ''|*[!0-9]*)
            echo >&2 "train: option $1 needs a non-negative integer, not '$2'"
            exit 1 ;;
    esac
}

# parse_options "$@"
# Normalise the arguments with getopt(1) and set the globals `action',
# `docmd', `startiter', `chunk' and `VERBOSE' accordingly.  -h prints the
# usage and exits 0; any invalid option or value exits 1.  Operands left
# after the options are ignored (nothing in this script reads them).
# Note that `"$@"' lets each command-line parameter expand to a separate
# word; the quotes around `$@' are essential.
parse_options () {
    local TEMP
    # getopt is run inside the `if' so that its failure reaches the message
    # below; as a bare assignment it would trip `set -e' first and the
    # script would die without printing it.
    if ! TEMP=$(getopt -a -s bash -o c:hnvs: -n 'train' -- "$@"); then
        echo "Terminating..." >&2
        exit 1
    fi

    # Note the quotes around `$TEMP': they are essential!
    eval set -- "$TEMP"
    while true ; do
        case "$1" in
            -h|--help)
                usageversion
                exit 0 ;;
            -n)
                action='echo'
                docmd='NO'
                shift ;;
            -s)
                require_uint -s "$2"
                startiter="$2"
                shift 2 ;;
            -c)
                require_uint -c "$2"
                # The training loop steps by chunk/2 and divides by that
                # step, so anything below 2 would divide by zero.
                if [ "$2" -lt 2 ]; then
                    echo >&2 "train: chunk size must be at least 2, not '$2'"
                    exit 1
                fi
                chunk="$2"
                shift 2 ;;
            -v)
                VERBOSE=1
                shift ;;
            --)
                shift
                break ;;
            *)
                echo >&2 "Internal error!($1)"
                usageversion
                exit 1 ;;
        esac
    done
}

parse_options "$@"

# ----------------------------------------------------------------------
# Main: train on a sliding window over the corpus.
#
# Iteration N stages the `chunk' messages ending at position
# chunk + N * (chunk / 2) of each corpus directory (ordered oldest first
# by modification time) into ./Spam and ./Ham, runs mailtrainer.crm on
# them, and snapshots spam.css / nonspam.css as *.css.N, keeping only the
# two most recent snapshots.
#
# In dry-run mode (`action' is `echo') the commands are only printed:
# nothing is deleted, copied or trained, and junk.log is not written.
# ----------------------------------------------------------------------

if [[ $action == echo ]]; then
    dry_run=1
else
    dry_run=0
fi

# note TEXT
# Report progress on stderr and, unless dry-running, in junk.log as well.
note () {
    printf '%s\n' "$1" >&2
    if [[ $dry_run -eq 0 ]]; then
        printf '%s\n' "$1" >> junk.log
    fi
}

# stage_messages KIND UPTO
# Recreate ./KIND from scratch and fill it with the last `chunk' of the
# first UPTO messages of $Corpus_Top/KIND.  Copies are named
# msg.<8-digit sequence>______<original name>.
stage_messages () {
    local kind=$1 upto=$2 seqno=1 msg
    rm -rf -- "$kind"
    mkdir -- "$kind"
    # Names are read one per line, so a file name containing blanks
    # stays in one piece.
    while IFS= read -r msg; do
        cp -- "$Corpus_Top/$kind/$msg" \
            "$kind/msg.$(printf '%08d' "$seqno")______$msg"
        seqno=$(( seqno + 1 ))
    done < <(ls -1tr -- "$Corpus_Top/$kind/" | head -n "$upto" | tail -n "$chunk")
}

# Fail early instead of treating an absent corpus as an empty one.
for kind in Spam Ham; do
    if [[ ! -d $Corpus_Top/$kind ]]; then
        echo >&2 "train: corpus directory $Corpus_Top/$kind not found"
        exit 1
    fi
done

[[ -d Ham ]]  || $action mkdir Ham
[[ -d Spam ]] || $action mkdir Spam

here=$(pwd)
incr=$(( chunk / 2 ))
# incr is used as a divisor below.
if [[ $incr -lt 1 ]]; then
    echo >&2 "train: chunk size must be at least 2 (got $chunk)"
    exit 1
fi

num_spam=$(ls -1 "$Corpus_Top/Spam" | wc -l)
num_good=$(ls -1 "$Corpus_Top/Ham"  | wc -l)
if [[ $num_spam -gt $num_good ]]; then
    max=$num_spam
else
    max=$num_good
fi

maxiter=$(( ( max - chunk ) / incr + 1 ))

iter=$startiter
while [[ $iter -le $maxiter ]]; do
    limit=$(( chunk + iter * incr ))
    if [[ $limit -gt $(( max + chunk )) ]]; then
        break
    fi
    note "Messages $(( limit - chunk + 1 )) --- $limit."

    # Staging goes through $action like every other step, so a dry run
    # prints the request instead of wiping and refilling the directories.
    for kind in Spam Ham; do
        $action stage_messages "$kind" "$limit"
    done

    $action /usr/share/crm114/mailtrainer.crm "--spam=$here/Spam/" \
        "--good=$here/Ham/" --repeat=100 "--streak=$(( 10 + 2 * chunk ))"
    $action cp --sparse=always spam.css "spam.css.$iter"
    $action cp --sparse=always nonspam.css "nonspam.css.$iter"

    # Drop the snapshot from two iterations back.
    if [[ $iter -gt 1 ]]; then
        for css in spam.css nonspam.css; do
            stale="${css}.$(( iter - 2 ))"
            [[ ! -f $stale ]] || $action rm -f "$stale"
        done
    fi

    if [[ $dry_run -eq 1 ]]; then
        echo cssdiff spam.css nonspam.css
    else
        cssdiff spam.css nonspam.css >> junk.log
    fi
    note "Done with $(( iter + 1 )) runs."
    #echo -n "Done with $(( iter + 1 )) runs. Continue? " >&2
    #read ans
    iter=$(( iter + 1 ))
done

#  Use the following to detect which messages caused crm to have to
#  correct a bad classification -- since perhaps it was the
#  classification, not crm, which is wrong.

#egrep 'refute| ER ' testlog  | \
# perl -pl -e 's/(\d)+______//g; s/Spam file /less Spam\//g; '          \
#          -e 's/Good file /less Ham\//g; s/ ER.*$//g; s/ \-\- .*$//g;' \
#             | sort -u  > /tmp/junk
#