Tuesday, April 21, 2009

19) Calculating Entropy of MSA by variable windowsize

The short version
Save file as "entropy2"
#!/bin/bash
L=`head -1 $2 |  wc -c | awk '{print $1}' | sed 's/.*/&-1/' | bc`
x=`expr $L - $1 + 1`
n=`cat $2 |wc -l `
w=$1
for i in `seq $x`
do
j=`expr $i + $w - 1`
cat $2 | cut -c$i-$j| sort | uniq -c
echo -n "Position=$i "
cat $2 | cut -c$i-$j| sort | uniq -c | awk  -v n="$n" '{ h=( -1 /  n * log(2))  * ( log($1) - log(n) ); H=h+H } END {print "Entropy=" H }'
echo -------
done

Example input file: input.seq
ASIFKHANT
BSIFKHANT
CTIFKHANT
DSIFKHANT
BSIFKHANT
DSAFKHANT

$ chmod +x entropy2
$  entropy2  2     input.seq
 
     1 AS
      2 BS
      1 CT
      2 DS
Position=1 Entropy=0.667818
-------
      1 SA
      4 SI
      1 TI
Position=2 Entropy=0.460826
-------
      1 AF
      5 IF
Position=3 Entropy=0.228055


The long version

#!/bin/bash
# Usage:  prog
# this program computes the Entropy of window size w for a
# multiple sequence alignment with gaps marked with "-" only.
# Date: 22 April 2009
# ToDo:
# does not trap missing $1
# does not handle blank or missing data or missing line
# works for windows = 1 onwards. w cannot be = or less than 0.
# need to check the equation for any errors
# assumes that the data input is well formated according to msa output
# does not trap data input errors and may still generate errorneous output
# does not cater for description tag of the clustal MSA output
# only handles one catenated sequence per line

input=$2
# Assumes the first line is the longest
# L = number of column positions to be computed
L=`head -1 $2 |  wc -c | awk '{print $1}' | sed 's/.*/&-1/' | bc`

# x - number of windows
x=`expr $L - $1 + 1`
echo Window Size: $1 size
echo No of Windows: $x windows
# number of sequences (=rows, since only one cateneted sequence per line allowed)
n=`cat $input |wc -l `
echo No of Lines: $n lines
w=$1

# i = current residue position begin
for i in `seq $x`
do


# i = current residue position end
j=`expr $i + $w - 1`

# print aligned sequences within window and count them
cat $input | cut -c$i-$j| sort | uniq -c

# Report position number
echo -n "Position=$i "
# Compute Entropy in bits of these sequences
cat $input | cut -c$i-$j| sort | uniq -c | awk  -v n="$n" '{ h=( -1 /  n * log(2))  * ( log($1) - log(n) ); H=h+H } END {print "Entropy=" H }'
echo -------
done


No comments:

Post a Comment