#! /bin/zsh -f # -*- Mode: Sh -*- # init-train.sh --- # Author : Manoj Srivastava ( srivasta@anzu.internal.golden-gryphon.com ) # Created On : Fri Nov 23 13:00:06 2007 # Created On Node : anzu.internal.golden-gryphon.com # Last Modified By : Manoj Srivastava # Last Modified On : Sat Nov 24 23:22:56 2007 # Last Machine Used: anzu.internal.golden-gryphon.com # Update Count : 28 # Status : Unknown, Use with caution! # HISTORY : # Description : # # # Make sure we abort on error set -e Corpus_Top="/backup/classify/Done" setq() { # Variable Value Doc string if [ "x$2" = "x" ]; then echo >&2 "$progname: Unable to determine $3" exit 1; else if [ ! "x$Verbose" = "x" ]; then echo "$progname: $3 is $2"; fi eval "$1=\"\$2\""; fi } withecho () { echo " $@" >&2 "$@" } usageversion () { cat >&2 <<END Debian GNU/Linux $progname $pversion. Copyright (C) 2007 Manoj Srivastava. This is free software; see the Artistic Licence for copying conditions. There is NO warranty. Usage: $progname [options] Options: -h print this message -p [ zero|one|two|four|ten ] END } # # Long term variables, which may be set in the config file or the # environment: # DEBUG rootdir workdir (if all original sources are kept in one dir) # # action='withecho' num_spam=$(ls -1 $Corpus_Top/Spam | wc -l) num_good=$(ls -1 $Corpus_Top/Ham | wc -l) count=$(( $num_spam + $num_good)) chunk=100 here=$(pwd) test -d Ham || mkdir Ham test -d Spam || mkdir Spam start=0 # Command line TEMP=$(getopt -a -s bash -o c:hs: --long help,percent -n 're-learn' -- "$@") # Note the quotes around `$TEMP': they are essential! eval set -- "$TEMP" if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi while true ; do case "$1" in -c) chunk="$2" ; shift 2 ;; -h) usageversion; exit 0 ; shift ;; -s) start="$2" ; shift 2 ;; --) shift ; break ;; '') break ;; *) echo >&2 "Internal error!($1)" usageversion; exit 1 ;; esac done if [ ! -x mailtrainer.crm ]; then echo >&2 "Could not find mailtrainer" exit 2 fi if [ $num_spam -gt $num_good ]; then max=$num_spam; else max=$num_good; fi maxiter=1 maxiter=$(( $max / $chunk )) # For the very first run, there should be no validation -- we need # all the data points to initialize our css files if [ ! -f spam.css ]; then cssutil -b -r -S 4194000 spam.css cssutil -b -r -S 4194000 nonspam.css percent='zero' v='' else test -f spam.css && cp -f spam.css spam.css.save test -f nonspam.css && cp -f nonspam.css nonspam.css.save cssutil -b -r spam.css cssutil -b -r nonspam.css fi h=$(( $start * $chunk + 1)) s=$(( $start * $chunk + 1)) for interations in $(seq $start $maxiter); do limit=$(( $chunk * ( $interations + 1 ) )) if [[ $limit -gt $(( $max + $chunk)) ]]; then break fi echo "Messages $(( $limit - $chunk + 1 )) --- $limit." >> junk.log echo >&2 "Messages $(( $limit - $chunk + 1 )) --- $limit." # Copy Ham if [[ $limit -lt $num_good ]]; then gmax=$(( $limit + $chunk)) if [[ $gmax -ge $num_good ]]; then hupper=$(($num_good - $limit)) else hupper=$chunk fi for i in $(ls -1tr $Corpus_Top/Ham/ | head -n $limit | tail -n $hupper); do cp -a $Corpus_Top/Ham/$i Ham/msg.$(print -f "%08i\n" $h)______${i##msg.[[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]]______} h=$(( $h + 1 )) done fi # Copy Spam if [[ $limit -lt $num_spam ]]; then smax=$(( $limit + $chunk)) if [[ $smax -ge $num_spam ]]; then supper=$(($num_spam - $limit)) else supper=$chunk fi for i in $(ls -1tr $Corpus_Top/Spam/ | head -n $limit | tail -n $supper); do cp -a $Corpus_Top/Spam/$i Spam/msg.$(print -f "%08i\n" $s)______${i##msg.[[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]][[:digit:]]______} s=$(( $s + 1 )) done fi ./mailtrainer.crm --spam=$here/Spam/ --good=$here/Ham/ --repeat=100 --streak=$(( 1 + 2 * $limit)) | perl -nle 'm/^\s+\S+|^Override|^\.CSS|^Running|\s+train|^Excel|^Finish|accuracy:/ && print' v="--validate=[_][_][_][_]" ./mailtrainer.crm --spam=$here/Spam/ $v --repeat=1 --good=$here/Ham/ | perl -nle 'm/^\s+\S+|^Override|^\.CSS|^Running|\s+train|^Excel|^Finish|accuracy:/ && print' | tee junk1.log cat junk1.log >> junk.log echo -n Date: >> junk.log date --utc >> junk.log done # | egrep -i '^( +|Finishing|.*train|Excell|Running)'