#!/bin/bash

##----------------------------------------------------------#
##                                                          #
##                     G A 2 C E L L                        # 
##                                                          #
##----------------------------------------------------------#
##  version 1.0, written by Luke Abraham 2006/08/03         #
##                                                          #
##                                                          #
## This script parses the .castep file of a genetic         #
## algorithm calculation and creates individual cell        #
## files of all the structures found during the course      #
## of the calculation.                                      #
##                                                          #
##----------------------------------------------------------#
## Requires:                                                #
##                                                          #
##   * newtep2cell to create the cell files (needs to be    #
##     in the user's $PATH).                                #
##   * This script also makes use of awk, grep, printf      #
##     and which, which are checked for. BASH must also     #
##     be available on the system.                          #
##                                                          #
##----------------------------------------------------------#
## Usage:                                                   #
##                                                          #
##   ga2cell foo.castep                                     #
##                                                          #
## Output:                                                  #
##                                                          #
##   This script will output a series of files named        #
##                                                          #
##      foo_genXXX_memXXX.cell                              #
##      foo_genXXX_kidXXX.cell                              #
##                                                          #
##   where the _mem files contain the cell information of   # 
##   the parents (members) and the _kid files contain the   #
##   information of the offspring produced from crossover.  #
##   The enthalpy of each structure is also saved to the    #
##   resultant .cell files.                                 #
##   This script makes a large number of files of the       #
##   form foo*tmp, where * is a varying number of           #
##   characters. These are made in the foo_cells            #
##   subdirectory next to the .cell files, and are          #
##   removed at the completion of the script.               #
##   Only configurations of fully minimised members will    #
##   be parsed - initial configurations will not be.        #
##                                                          #
##----------------------------------------------------------#

 
# exactly one argument is expected: either the .castep file name or one
# of the help flags; anything else is a usage error (exit status 1)
if (( $# != 1 )); then
    printf '%s\n' \
        "ERROR - incorrect number of arguments" \
        "        run with -h or --help for usage information"
    exit 1
fi


# when help was requested, print the usage summary and stop (exit status 0)
case "$1" in
    -h|--help)
        printf '%s\n' \
            " ga2cell" \
            " " \
            " usage:" \
            "         ga2cell foo.castep" \
            " " \
            " output:" \
            "         foo_genXXX_memXXX.cell" \
            "         foo_genXXX_kidXXX.cell" \
            " " \
            " written by Luke Abraham 02/08/06"
        exit 0
        ;;
esac


# check that a tool needed by ga2cell can be found by the shell
#   $1 - name of the command to look for
# Prints an error message and exits the script with status 2 when the
# command cannot be found; otherwise returns silently.
# `command -v` is a shell builtin, so the check itself does not depend on
# an external program and also recognises builtins such as printf.
function check_exists {
    if ! command -v "$1" > /dev/null 2>&1; then
        echo "ERROR - $1 is required in the use of ga2cell"
        exit 2
    fi
}
# every tool named here must be present before any work is attempted;
# check_exists terminates the script on the first one that is missing
chklst="which awk grep printf newtep2cell"
for func in ${chklst}
do
    check_exists "${func}"
done


# print the seed for a .castep file name: the name with its final
# extension removed
#   $1 - file name as given on the command line
# Only the last ".ext" of the last path component is stripped, so
# "./foo.castep" gives "./foo" and "my.run.castep" gives "my.run"; a name
# without an extension is returned unchanged.
function seed_from_arg {
    local name=$1
    local base=${name##*/}
    case "$base" in
        ?*.*) printf '%s\n' "${name%.*}" ;;
        *)    printf '%s\n' "$name" ;;
    esac
}

# get seed for .castep file - will be used in making other files
seed=$(seed_from_arg "$1")

# set the current directory
cdir=$PWD'/'
stem=$seed'_cells'
# set the print directory
wdir=$cdir$stem'/'

# make and move to write directory
# all temporary and .cell files are written relative to this directory,
# so stop (exit status 8, not in the legacy list at the end of the file)
# rather than scatter files in the wrong place
if [ ! -d "$wdir" ]; then
    if ! mkdir "$wdir"; then
        echo "ERROR - could not create directory $wdir"
        exit 8
    fi
fi
if ! cd "$wdir"; then
    echo "ERROR - could not change to directory $wdir"
    exit 8
fi

fname=$cdir$seed.castep
# if doing a parallel calculation, all the useful stuff is there
farmfile=$cdir$seed'_farm001.castep'
# check and see if it exists; readfile is the file that the population
# size and cell constraints are read from
if [ -e "$farmfile" ]; then
    readfile=$farmfile
else
    readfile=$fname
fi


# check and see if .castep file exists
if [ ! -r "${fname}" ]; then
    echo "ERROR - file $fname does not exist or is not readable"
    echo "        run with -h or --help for usage information"
    exit 3
fi
# the farm file is parsed as well, so it has to be readable too
if [ ! -r "${readfile}" ]; then
    echo "ERROR - file $readfile does not exist or is not readable"
    echo "        run with -h or --help for usage information"
    exit 3
fi


# intro
echo "     creating CASTEP .cell files from GA run:"


# get number of generations and number of population members
# if less than one generation has been performed (e.g. because
# of an abort) this will default to zero.
# maxgen is field 7 of the last "end of generation" line in $fname;
# pop is field 7 of the last "population size" line in $readfile
maxgen=$(grep "GA: Results for end of generation:" "$fname" | awk 'BEGIN{x=0}{x=$7}END{print x}')
pop=$(awk 'BEGIN{x=0}/population size/{x=$7}END{print x}' "$readfile")
# test that pop contains a usable value: an empty or non-numeric result
# is as useless as zero, so all three are rejected
if ! [[ $pop =~ ^[0-9]+$ ]] || [ "$pop" -eq 0 ]; then
    echo "ERROR - could not determine population size"
    exit 4
fi


# awk programs used to slice the .castep file; each expects the number
# being searched for to be supplied with "-v val=N"

# awk1: copy everything from the "Results for end of generation: val"
# line up to and including the "Finished GA generation val" line
awk1='
    /GA: Results for end of generation:/ && $7 == val { copying = 1 }
    copying { print }
    /GA: Finished GA generation/ && $5 == val { exit }
    '
# awk2: copy one member - from its "Final configuration of member val"
# line up to and including the header line of member val+1
awk2='
    BEGIN { following = val + 1 }
    /GA: Final configuration of member/ && $6 == val { copying = 1 }
    copying { print }
    /GA: Final configuration of member/ && $6 == following { exit }
    '
# awkkid: same job as awk1, but for the offspring (children/kids) section,
# which starts at "Results for children born in generation: val"
awkkid='
    /GA: Results for children born in generation:/ && $8 == val { copying = 1 }
    copying { print }
    /GA: Finished GA generation/ && $5 == val { exit }
    '

# locate the newtep2cell converter (part of the CASTEP cteprouts
# packages) through $PATH instead of hard coding its location, so that
# the script can be moved between machines unchanged
n2c=$(which newtep2cell)


# required to test for older versions of the code
# works out if have printed out information at end of generation zero
# mingen is 0 when a "Results for end of generation:    0" line exists,
# otherwise it stays at its default of 1; i is the first generation
# that will be parsed
mingen=$(awk 'BEGIN{x=1}/GA: Results for end of generation:    0/{x=$7; exit}END{print x}' "$fname")
if [ "$mingen" -eq 0 ]; then
    i=0
else
    i=1
fi


# required to test for older versions of the code
# works out if have printed out information from offspring
# minkid is the generation number on the first "children born" line,
# or 0 when the file contains no such line
minkid=$(awk 'BEGIN{x=0}/GA: Results for children born in generation:/{x=$8; exit}END{print x}' "$fname")
if [ "$minkid" -ne 0 ]; then
    kidcheck=1
else
    kidcheck=0
fi


# older versions of the code may not print the cell for every member of a
# fixed cell calculation; the cell is then kept in this temporary file
fixtmp="${seed}_cell.tmp"
# awkcell1: copy the first "Real Lattice" line of the .castep file
# together with the three lines that follow it
awkcell1='
    /Real Lattice/ { lines = 1 }
    lines > 0 { lines++; print }
    lines > 4 { exit }
    '
# awkcell2: decide whether this is a fixed cell calculation.
# Only meant for older versions of the code and may break with some
# suggested changes to the output format.
# Prints 1 when the number of cell constraints is 6 and all six entries
# on the "Cell constraints are" line are equal, and 0 otherwise - so the
# default is NOT to print the cell.
awkcell2='
    /Number of cell constraints/ { ncon = $5 }
    /Cell constraints are/ {
        if (ncon == 6 && $4 == $5 && $6 == $7 && $8 == $9 && $4 == $6 && $4 == $8)
            fixed = 1
        exit
    }
    END { print (fixed ? 1 : 0) }
    '
# work out if we have a fixed cell (1) or variable cell (0)
printcell=$(awk "$awkcell2" "$readfile")
if [ -z "${printcell}" ]; then
    echo "ERROR - could not determine cell attributes"
    exit 7
fi
if [ "${printcell}" -eq 1 ]; then
    # we need to print the cell: save the lattice block of the .castep
    # file so that it can be added to members that lack one
    awk "$awkcell1" "$fname" > "$fixtmp"
fi
# later versions of the code WILL include the cell, even in a
# fixed cell calculation. This function will check and see
# if this is the case here
#   $1 - file to search
# Prints the exit status of a case-insensitive grep for "Real Lattice":
# 0 when the file contains cell information, 1 when it does not, and
# grep's error status when the file cannot be read.
function check_cell {
    grep -qi "Real Lattice" "$1" > /dev/null 2>&1
    echo $?
}


# checks that all members in the final generation have been
# printed out. Will only trap if abort occurred in writing out stage
# If it occurred when halfway through a member then there is nothing it
# can do - it is very hard to tell in this case.
#   $1 - file holding one generation (or one set of offspring)
# Prints the last field of the last "GA: Final configuration of" line in
# the file (matched case-insensitively), or 0 when there is no such line.
function check_member {
    local last
    last=$(grep -i "GA: Final configuration of" "$1" | awk 'BEGIN{x=0}{x=$NF}END{print x}')
    echo "$last"
}


# set up the variables needed to write the progress to the screen
# first - work out the total number of files that we're going to make
# do we have a generation 0
if (( mingen == 0 )); then
    # do we have kids to think about?
    if (( kidcheck == 1 )); then
        # number after including kids and gen 0
        numfiles=$(( pop * maxgen * 2 + pop ))
    else
        # this should never happen - no kids but a gen 0
        numfiles=$(( pop * maxgen + pop ))
    fi
else
    # no generation 0
    if (( kidcheck == 1 )); then
        # this should never happen - kids but no gen 0
        numfiles=$(( pop * maxgen * 2 ))
    else
        # original number
        numfiles=$(( pop * maxgen ))
    fi
fi
# initialise some counters, and set up the hash symbols
filenum=0
str="#"
printstr=" "
# check is the percentage step between progress bar updates; it may need
# to be bigger than 2 when one file is worth more than 2 percent.
# The program is BEGIN-only so that awk does not have to read any input.
awkcount='
         BEGIN{ x=(100.0*(var1/var2))
                if ( x <= 2 )
                   printf "%i", 2
                else if ( x%2 == 0 )
                   printf "%i", x
                else
                   printf "%i", (x+2)
              }
         '
if (( numfiles > 0 )); then
    check=$(awk -v var1=1 -v var2="$numfiles" "$awkcount")
else
    # nothing to convert (e.g. an aborted run without a generation 0):
    # avoid dividing by zero and fall back to the smallest step
    check=2
fi
# never continue with an unusable step, as it is used as a divisor below
if ! [[ $check =~ ^[0-9]+$ ]] || (( check < 2 )); then
    check=2
fi
# number of hash symbols added at each step
hashinc=$(( check / 2 ))
# set up initial number of hashes - may not be zero if 50 is not a
# multiple of hashinc
hashorig=$(( 50 % hashinc ))
hashcheck=0
while (( hashcheck < hashorig )); do
    printstr=$printstr$str
    (( hashcheck += 1 ))
done
checkinc=$check
# write one progress line (percentage, hash bar, files done / total),
# ending in a carriage return so that the next call overwrites it
# Reads the globals filenum, numfiles, check, checkinc, hashinc, str and
# printstr; updates printstr and check when another step has been
# reached, and increases filenum by one on every call.
function write_screen {
    local percent=0 added bar
    # work out the percentage in integer arithmetic; the guard keeps a run
    # with no files to convert from dividing by zero
    if (( numfiles > 0 )); then
        percent=$(( 100 * filenum / numfiles ))
    fi
    # do we need to append another set of hashes?
    if (( percent >= check )); then
        # yes - add hashinc extra hash symbols at the end
        for (( added = 0; added < hashinc; added++ )); do
            printstr=$printstr$str
        done
        # move on to the next percentage check
        (( check += checkinc ))
    fi
    # write percentage
    printf '%8i%% ' "$percent"
    # write string of hashes followed by file progress; printstr starts
    # with a blank which is not part of the bar
    bar=${printstr// /}
    if [ -z "$bar" ]; then
        # may not have any hashes to print yet
        printf ' %i/%i \r' "$filenum" "$numfiles"
    else
        printf '%s %i/%i \r' "$bar" "$filenum" "$numfiles"
    fi
    # increase the file checked count
    (( filenum += 1 ))
}
# write out an initial percentage: filenum is still 0 at this point, so
# this shows the progress line before any .cell file has been created
write_screen


# this function actually does all the hard work: it cuts one member out
# of a generation file and converts it into a .cell file
#   $1 - seed of the output file; <seed>.cell is created and <seed>.tmp
#        and <seed>_mem.tmp are used as scratch files and then removed
#   $2 - file holding all members of one generation (or its offspring)
# Reads the globals j (number of the member to extract), awk2, printcell,
# fixtmp and n2c, and calls check_cell and write_screen.
function create_cell {
    local seedfile=$1
    local parsefile=$2
    local tmpcatfile=$seedfile.tmp
    local catfile=${seedfile}_mem.tmp
    local cellfile=$seedfile.cell
    local title cellchk
    local -a words

    # get cell info
    awk -v val="$j" "$awk2" "$parsefile" > "$tmpcatfile"

    # put enthalpy info as title
    title=$(grep "Final enthalpy of member" "$tmpcatfile" | awk '{print $0; exit}')
    # runs of blanks in the title are squeezed to single blanks; splitting
    # with read does this without exposing the text to filename globbing
    read -r -a words <<< "$title"
    # format required from newtep2cell
    {
        echo " ************************************ Title ************************************"
        printf '%s\n' "${words[*]}"
    } > "$catfile"

    # do we need to add the cell in a fixed cell case?
    # not required in new versions of the code
    if [ "${printcell}" -eq 1 ]; then
        # check and see if we actually do have cell info
        cellchk=$(check_cell "$tmpcatfile")
        if [ "${cellchk}" -ne 0 ]; then
            cat "$fixtmp" >> "$catfile"
        fi
    fi
    # now add cell info after title
    cat "$tmpcatfile" >> "$catfile"

    # now create the .cell file
    "$n2c" "$catfile" > "$cellfile"

    # tell user of progress
    write_screen

    # cleanup
    rm -f "$tmpcatfile" "$catfile"
}


# loop over number of generations
# i was set earlier to the first generation to parse; for each
# generation the offspring (if any) are converted first, followed by the
# members. j is left global because create_cell reads it.
while [ "$i" -le "$maxgen" ]; do

    #echo "Parsing Generation $i:"

    # create temporary file for putting required info on all members
    genfile=$(printf "%s_gen%3.3i.tmp" "$seed" "$i")
    awk -v val="$i" "$awk1" "$fname" > "$genfile"

    # number of the last member that was written out for this generation
    finval=$(check_member "$genfile")

    # get info on kids does not need to be done on generation 0
    if [ "$i" -gt 0 ] && [ "$kidcheck" -gt 0 ]; then

        # create temporary file for putting required info on all offspring
        kidfile=$(printf "%s_gen%3.3i_kids.tmp" "$seed" "$i")
        awk -v val="$i" "$awkkid" "$fname" > "$kidfile"

        # number of the last offspring that was written out
        kidfinval=$(check_member "$kidfile")

        # now operate on the offspring individually
        j=1
        while [ "$j" -le "$pop" ]; do

            # check that we have a complete set of members
            if [ "$j" -gt "$kidfinval" ] && [ "$kidfinval" -ne "$pop" ]; then
                # print newline after all the hash marks
                printf "\n"
                echo "WARNING: Run out of offspring - exiting"
                # clean up files before exiting - this includes the
                # generation file, which has already been created
                rm -f "$kidfile" "$genfile" "$fixtmp"
                exit 5
            fi

            #echo "  generating .cell file for kid $j..."

            # create filename
            memfile=$(printf "%s_gen%3.3i_kid%3.3i" "$seed" "$i" "$j")

            # now create cell file
            create_cell "$memfile" "$kidfile"

            j=$(( j + 1 ))
        done

        # cleanup
        rm -f "$kidfile"
    fi

    # now operate on the members individually
    j=1
    while [ "$j" -le "$pop" ]; do

        # check that we have a complete set of members
        if [ "$j" -gt "$finval" ] && [ "$finval" -ne "$pop" ]; then
            # print newline after all the hash marks
            printf "\n"
            echo "WARNING: Run out of members - exiting"
            # clean up files before exiting
            rm -f "$genfile" "$fixtmp"
            exit 6
        fi

        #echo "  generating .cell file for member $j..."

        # create filename
        memfile=$(printf "%s_gen%3.3i_mem%3.3i" "$seed" "$i" "$j")

        # now create cell file
        create_cell "$memfile" "$genfile"

        j=$(( j + 1 ))
    done

    i=$(( i + 1 ))
    # cleanup
    rm -f "$genfile"
done


# final bit of cleanup: the saved fixed cell only exists for fixed cell
# calculations, and rm -f is silent when there is nothing to remove
rm -f "$fixtmp"

# print newline after all the hash marks
printf "\n"
# tell user where the output files can be found
echo "     output files can be found in the $stem subdirectory"

exit 0
# end of script ga2cell
# This script is written in bash due to its ability to hold
# the awk scripts as internal variables. If written in csh
# then these would need to be separate external files.
#
# List of error codes:
#
#  0  script has exited correctly, either from a .cell conversion
#     or a -h/--help enquiry
#  1  incorrect number of arguments
#  2  lacking a required resource - see error message for more details
#      - will be one of: which, awk, grep, printf or newtep2cell
#  3  the .castep file does not exist or is not readable by the script
#  4  the script could not determine the population size
#  5  fewer offspring than the normal (population size) number were found
#  6  fewer members than the normal (population size) number were found
#  7  the script was unable to tell if this was a fixed or variable cell 
#     calculation