#! /bin/zsh -f
#                               -*- Mode: Sh -*-
# train.sh ---
# Author           : Manoj Srivastava ( srivasta@glaurung.internal.golden-gryphon.com )
# Created On       : Wed Nov 22 12:33:55 2006
# Created On Node  : glaurung.internal.golden-gryphon.com
# Last Modified By : Manoj Srivastava
# Last Modified On : Sat Nov 25 23:58:46 2006
# Last Machine Used: glaurung.internal.golden-gryphon.com
# Update Count     : 57
# Status           : Unknown, Use with caution!
# HISTORY          :
# Description      :
#   Slides a window of $chunk messages (advancing by $chunk / 2 per
#   iteration) over the Spam and Ham directories below $Corpus_Top, taken
#   in `ls -tr` order.  For every window the messages are copied into
#   ./Spam and ./Ham, /usr/share/crm114/mailtrainer.crm is run on them, and
#   spam.css / nonspam.css are snapshotted as <name>.<iteration>, keeping
#   only the two most recent snapshots.  Progress and the output of
#   cssdiff are appended to ./junk.log.
#
#   NOTE(review): with -n only the trainer and the snapshot cp/rm commands
#   are routed through $action; the staging of ./Spam and ./Ham, the
#   junk.log appends and cssdiff still run.
#

set -e

progname=${0##*/}
Corpus_Top="/backup/classify/Done"
chunk=500
startiter=0

# Echo a command line (prefixed by one space) on stderr, then execute it.
# Arguments: the command and its arguments.
# Returns:   the exit status of the executed command.
withecho () {
    printf ' %s\n' "$*" >&2
    "$@"
}

# Command prefix for the trainer and snapshot steps; -n swaps it for `echo'.
action='withecho'

# Print the usage summary on stderr.
usageversion () {
    cat >&2 <<END
Usage: $progname [options]
Options:
  -h           print this message
  -c size      Set chunk size. nominally 500
  -n           "Dry-run" mode - No action taken, only print commands.
  -s int       Start at iteration given, rather than 0
  -v           Make the command verbose
END
}

# parse Command line
# Note that we use `"$@"' to let each command-line parameter expand to a
# separate word. The quotes around `$@' are essential!
# We need TEMP as the `eval set --' would nuke the return value of getopt.
# The status is tested directly on the assignment: with `set -e' in effect
# a separate `$?' check afterwards would never be reached.
if ! TEMP=$(getopt -a -s bash -o c:hnvs: -n 'train' -- "$@"); then
    echo "Terminating..." >&2
    exit 1
fi

# Note the quotes around `$TEMP': they are essential!
eval set -- "$TEMP"

while true ; do
    case "$1" in
        -h|--help)
            usageversion
            exit 0
            ;;
        -n)
            action='echo'
            docmd='NO'          # recorded, but not read anywhere in this script
            shift
            ;;
        -s)
            startiter="$2"
            shift 2
            ;;
        -c)
            chunk="$2"
            shift 2
            ;;
        -v)
            VERBOSE=1           # recorded, but not read anywhere in this script
            shift
            ;;
        --)
            shift
            break
            ;;
        *)
            echo >&2 "Internal error!($1)"
            usageversion
            exit 1
            ;;
    esac
done

# Both numeric options feed shell arithmetic below, so reject anything that
# is not a plain non-negative integer.
case "$chunk" in
    ''|*[!0-9]*)
        echo >&2 "$progname: chunk size must be a positive integer, got '$chunk'"
        exit 1
        ;;
esac
case "$startiter" in
    ''|*[!0-9]*)
        echo >&2 "$progname: start iteration must be a non-negative integer, got '$startiter'"
        exit 1
        ;;
esac
# The window advances by chunk / 2; anything below 2 would make that step 0
# and the iteration count below a division by zero.
if (( chunk < 2 )); then
    echo >&2 "$progname: chunk size must be at least 2, got '$chunk'"
    exit 1
fi

mkdir -p Ham Spam

here=$(pwd)
incr=$(( chunk / 2 ))

num_spam=$(ls -1 "$Corpus_Top/Spam" | wc -l)
num_good=$(ls -1 "$Corpus_Top/Ham" | wc -l)
if (( num_spam > num_good )); then
    max=$num_spam
else
    max=$num_good
fi

# Number of the last window needed to reach the end of the larger corpus.
maxiter=$(( ( max - chunk ) / incr + 1 ))

for (( iteration = startiter; iteration <= maxiter; iteration++ )); do
    # The window covers messages (limit - chunk + 1) .. limit.
    limit=$(( chunk + iteration * incr ))
    if (( limit > max + chunk )); then
        break
    fi
    echo "Messages $(( limit - chunk + 1 )) --- $limit." >> junk.log
    echo >&2 "Messages $(( limit - chunk + 1 )) --- $limit."

    for type in Spam Ham; do
        # Start every window from an empty staging directory.
        test ! -d "$type" || rm -rf -- "$type"
        mkdir "$type"
        k=1
        # Read the listing line by line so that file names containing
        # blanks are not split into several words.
        while IFS= read -r msg; do
            seqno=$(printf '%08d' "$k")
            cp -- "$Corpus_Top/$type/$msg" "$type/msg.${seqno}______$msg"
            k=$(( k + 1 ))
        done < <(ls -1tr "$Corpus_Top/$type/" | head -n "$limit" | tail -n "$chunk")
    done

    $action /usr/share/crm114/mailtrainer.crm --spam="$here/Spam/" \
        --good="$here/Ham/" --repeat=100 --streak=$(( 10 + 2 * chunk ))

    $action cp --sparse=always spam.css "spam.css.$iteration"
    $action cp --sparse=always nonspam.css "nonspam.css.$iteration"

    # Keep only the two most recent snapshots.
    if (( iteration > 1 )); then
        stale=$(( iteration - 2 ))
        test ! -f "spam.css.$stale" || $action rm -f "spam.css.$stale"
        test ! -f "nonspam.css.$stale" || $action rm -f "nonspam.css.$stale"
    fi

    cssdiff spam.css nonspam.css >> junk.log
    echo "Done with $(( iteration + 1 )) runs." >> junk.log
    echo >&2 "Done with $(( iteration + 1 )) runs."
    #echo -n "Done with $(( iteration + 1 )) runs. Continue? " >&2
    #read ans
done

# Use the following to detect which messages caused crm to have to
# correct a bad classification -- since perhaps it was the
# classification, not crm, which is wrong.
#egrep 'refute| ER ' testlog | \
#  perl -pl -e 's/(\d)+______//g; s/Spam file /less Spam\//g; ' \
#           -e 's/Good file /less Ham\//g; s/ ER.*$//g; s/ \-\- .*$//g;' \
#  | sort -u > /tmp/junk
#