#!/bin/bash /usr/biotools/saco_convert -I genbank -O length XXXX.gbk > XXXX.length /usr/biotools/saco_extract -I genbank -f 'CDS|[rt]RNA' -b `gawk '{print int(0.001*$1);}' XXXX.length` -e `gawk '{print int(0.001*$1);}' XXXX.length` XXXX.gbk > XXXX.atlasfeatures.tab.temp gawk -F '\t' '{a=$1; sub(/_C$/, "", a); print a "\t" $2 "\t" $3 "\t"$4; }' XXXX.atlasfeatures.tab.temp > XXXX.atlasfeatures.tab rm -f XXXX.atlasfeatures.tab.temp perl -e '%val = ("AA", -5.37, "AC", -10.51, "AG", -6.78, "AT", -6.57, "CA", -6.57, "CC", -8.26, "CG", -9.61, "CT", -6.78, "GA", -9.81, "GC", -14.59, "GG", -8.26, "GT", -10.51, "TA", -3.82, "TC", -9.81, "TG", -6.57, "TT", -5.37); while (<>) {$_ = uc;@seq = split(/\t/, $_); $obs = 0; $sum = 0; for ($i = 0; $i+2 <= length($seq[1]); $i++) {if (defined($val{substr($seq[1], $i, 2)})) {$obs++; $sum += $val{substr($seq[1], $i, 2)};}} if ($obs) {printf("%.5f\n", $sum/$obs);} else {print "\n";}}' XXXX.atlasfeatures.tab > XXXX.atlasfeatures.ornstein.rowavg gawk '(NF >= 1){a++; b += $NF; c += $NF^2}END{print (a > 0 ? b/a : 0) "\t" (a > 1 ? sqrt((c-b^2/a)/(a-1)) : 1E99);}' XXXX.atlasfeatures.ornstein.rowavg > XXXX.atlasfeatures.ornstein.rowavg.stddev gawk '(FILENAME == "XXXX.atlasfeatures.ornstein.rowavg.stddev"){a = $1; b = $2; if (b==0) b=1;}(FILENAME != "XXXX.atlasfeatures.ornstein.rowavg.stddev"){for (i = 1; i < NF; i++) printf("%s\t", $i); print ($NF-a)/b;}' XXXX.atlasfeatures.ornstein.rowavg.stddev XXXX.atlasfeatures.ornstein.rowavg > XXXX.atlasfeatures.ornstein.rowavg.zscores perl -e '%val = ("AAA", 0.36, "AAC", 0.06, "AAG", 0.06, "AAT", 0.30, "ACA", 0.06, "ACC", 0.08, "ACG", 0.08, "ACT", 0.11, "AGA", 0.09, "AGC", 0.25, "AGG", 0.08, "AGT", 0.11, "ATA", 0.13, "ATC", 0.07, "ATG", 0.18, "ATT", 0.30, "CAA", 0.09, "CAC", 0.17, "CAG", 0.02, "CAT", 0.18, "CCA", 0.08, "CCC", 0.13, "CCG", 0.02, "CCT", 0.08, "CGA", 0.31, "CGC", 0.25, "CGG", 0.02, "CGT", 0.08, "CTA", 0.18, "CTC", 0.08, "CTG", 0.02, "CTT", 0.06, "GAA", 0.12, "GAC", 0.08, "GAG", 0.08, "GAT", 0.07, "GCA", 0.13, "GCC", 0.45, "GCG", 0.25, "GCT", 0.25, "GGA", 0.05, "GGC", 0.45, "GGG", 0.13, "GGT", 0.08, "GTA", 0.06, "GTC", 0.08, "GTG", 0.17, "GTT", 0.06, "TAA", 0.20, "TAC", 0.06, "TAG", 0.18, "TAT", 0.13, "TCA", 0.08, "TCC", 0.05, "TCG", 0.31, "TCT", 0.09, "TGA", 0.08, "TGC", 0.13, "TGG", 0.08, "TGT", 0.06, "TTA", 0.20, "TTC", 0.12, "TTG", 0.09, "TTT", 0.36); while (<>) {$_ = uc;@seq = split(/\t/, $_); $obs = 0; $sum = 0; for ($i = 0; $i+3 <= length($seq[1]); $i++) {if (defined($val{substr($seq[1], $i, 3)})) {$obs++; $sum += $val{substr($seq[1], $i, 3)};}} if ($obs) {printf("%.5f\n", $sum/$obs);} else {print "\n";}}' XXXX.atlasfeatures.tab > XXXX.atlasfeatures.travers.rowavg gawk '(NF >= 1){a++; b += $NF; c += $NF^2}END{print (a > 0 ? b/a : 0) "\t" (a > 1 ? sqrt((c-b^2/a)/(a-1)) : 1E99);}' XXXX.atlasfeatures.travers.rowavg > XXXX.atlasfeatures.travers.rowavg.stddev gawk '(FILENAME == "XXXX.atlasfeatures.travers.rowavg.stddev"){a = $1; b = $2; if (b==0) b=1;}(FILENAME != "XXXX.atlasfeatures.travers.rowavg.stddev"){for (i = 1; i < NF; i++) printf("%s\t", $i); print ($NF-a)/b;}' XXXX.atlasfeatures.travers.rowavg.stddev XXXX.atlasfeatures.travers.rowavg > XXXX.atlasfeatures.travers.rowavg.zscores gawk -F '\t' '{print $1;}' XXXX.atlasfeatures.tab > XXXX.atlasfeatures.names gawk -F '\t' '{if ($4 ~ /\/gene=\"/) {a = $4; sub(/^.*\/gene=\"/, "", a); sub(/\".*/, "", a);} else {a = $1;} print a "\t" $2 "\t" $3 "\t" $4}' XXXX.atlasfeatures.tab > XXXX.atlasfeatures.genenames.tab gawk -F '\t' '{print $1;}' XXXX.atlasfeatures.genenames.tab > XXXX.atlasfeatures.genenames.names paste XXXX.atlasfeatures.ornstein.rowavg.zscores XXXX.atlasfeatures.travers.rowavg.zscores XXXX.atlasfeatures.names XXXX.atlasfeatures.genenames.names | \ gawk '{a = $1^2; if ($2^2 > a) a = $2^2; c = split($3, b, /[_-]/); if (a > 4) print sqrt(a) "\t" b[c-1] "\t" b[c] "\t" b[c-2] "\t" $4;}' | \ sort -n -r | \ gawk '{for (i in a) if ((i-int(($2+$3)/2))^2 < 34862^2) next;}{a[int(($2+$3)/2)] = 1; if ($5 ~ /^[0-9a-zA-Z_][a-z\.A-Z0-9_\-]*$/) {printf("%-15s %8d %8d %1s %s\n", $4, ($2 < $3 ? $2 : $3), ($2 > $3 ? $2 : $3), ($2 > $3 ? "-" : "+"), $5); if (++b >= 20) exit;}}' > XXXX.genomeatlas.ann perl -pe 's/^([0-9]{1,3})(([0-9]{3})+)$/$a=$1; $b=$2; $b =~ s#([0-9]{3})#,\1#g; $a.$b/e' XXXX.length > XXXX.lengthtext perl -ne 'if (/^ ORGANISM /) {s/^ ORGANISM //; m/^([^ ][^ ]* [^ 0-9]+ .*?)$/; print $1;}' XXXX.gbk > XXXX.organism perl -ne 's/["]+//g; print;' < XXXX.organism | perl -ne "s/[']+//g; print;" > XXXX.organismStripQuote perl -ne 'if (/^DEFINITION .*[Cc]hromosome [0-9IVXY]+/) {s/^DEFINITION .*[Cc]hromosome ([0-9IVXY]+).*/$1/; print;}' XXXX.gbk > XXXX.chromosome genomeAtlasCf XXXX /usr/biotools/saco_convert -I genbank -O raw XXXX.gbk > XXXX.raw perl -ne 'chomp; next unless length($_) >= 21; s/[^ATGCatgc]/G/g; s/(.{60})/$1\n/g; print uc ( $_ )."\n";' XXXX.raw > XXXX.curvature.seq echo ' TWIST WEDGE DIRECTION ' > angles.dat echo '' >> angles.dat echo 'AA 35.62 7.20 -154.00' >> angles.dat echo 'AC 34.40 1.10 143.00' >> angles.dat echo 'AG 27.70 8.40 2.00' >> angles.dat echo 'AT 31.50 2.60 0.00' >> angles.dat echo 'CA 34.50 3.50 -64.00' >> angles.dat echo 'CC 33.67 2.10 -57.00' >> angles.dat echo 'CG 29.80 6.70 0.00' >> angles.dat echo 'CT 27.70 8.40 -2.00' >> angles.dat echo 'GA XXXX.90 5.30 120.00' >> angles.dat echo 'GC 40.00 5.00 180.90' >> angles.dat echo 'GG 33.67 2.10 57.00' >> angles.dat echo 'GT 34.40 1.10 -143.00' >> angles.dat echo 'TA XXXX.00 0.90 0.00' >> angles.dat echo 'TC XXXX.90 5.30 -120.00' >> angles.dat echo 'TG 34.50 3.50 64.00' >> angles.dat echo 'TT 35.62 7.20 154.00' >> angles.dat echo '' >> angles.dat echo '' >> angles.dat # we ignore return status from curvature, since it returns 0 and 1 pretty randomly /usr/biotools/indirect/curva -f XXXX.curvature.seq -o XXXX.curvature.map > /dev/null rm -f XXXX.curvature.ou2 angles.dat work.0 gawk '{print $2;}' XXXX.curvature.map > XXXX.curvature gzip -c XXXX.curvature > XXXX.curvature.gz /usr/biotools/saco_convert -A dna2 -I genbank -O skews XXXX.gbk > XXXX.baseskews.col gzip -c XXXX.baseskews.col > XXXX.baseskews.col.gz /usr/biotools/saco_convert -I genbank -O tab XXXX.gbk > XXXX.tab perl -e '%val = ("AA", -5.37, "AC", -10.51, "AG", -6.78, "AT", -6.57, "CA", -6.57, "CC", -8.26, "CG", -9.61, "CT", -6.78, "GA", -9.81, "GC", -14.59, "GG", -8.26, "GT", -10.51, "TA", -3.82, "TC", -9.81, "TG", -6.57, "TT", -5.37); while (<>) {$_ = uc;@seq = split(/\t/, $_); for ($i = 0; $i+2 <= length($seq[1]); $i++) {if (defined($val{substr($seq[1], $i, 2)})) {printf("%.3f\n", $val{substr($seq[1], $i, 2)});} else {print "\n";}}}' XXXX.tab > XXXX.ornstein gzip -c XXXX.ornstein > XXXX.ornstein.gz perl -e '%val = ("AAA", 0.36, "AAC", 0.06, "AAG", 0.06, "AAT", 0.30, "ACA", 0.06, "ACC", 0.08, "ACG", 0.08, "ACT", 0.11, "AGA", 0.09, "AGC", 0.25, "AGG", 0.08, "AGT", 0.11, "ATA", 0.13, "ATC", 0.07, "ATG", 0.18, "ATT", 0.30, "CAA", 0.09, "CAC", 0.17, "CAG", 0.02, "CAT", 0.18, "CCA", 0.08, "CCC", 0.13, "CCG", 0.02, "CCT", 0.08, "CGA", 0.31, "CGC", 0.25, "CGG", 0.02, "CGT", 0.08, "CTA", 0.18, "CTC", 0.08, "CTG", 0.02, "CTT", 0.06, "GAA", 0.12, "GAC", 0.08, "GAG", 0.08, "GAT", 0.07, "GCA", 0.13, "GCC", 0.45, "GCG", 0.25, "GCT", 0.25, "GGA", 0.05, "GGC", 0.45, "GGG", 0.13, "GGT", 0.08, "GTA", 0.06, "GTC", 0.08, "GTG", 0.17, "GTT", 0.06, "TAA", 0.20, "TAC", 0.06, "TAG", 0.18, "TAT", 0.13, "TCA", 0.08, "TCC", 0.05, "TCG", 0.31, "TCT", 0.09, "TGA", 0.08, "TGC", 0.13, "TGG", 0.08, "TGT", 0.06, "TTA", 0.20, "TTC", 0.12, "TTG", 0.09, "TTT", 0.36); while (<>) {$_ = uc;@seq = split(/\t/, $_); for ($i = 0; $i+3 <= length($seq[1]); $i++) {if (defined($val{substr($seq[1], $i, 3)})) {printf("%.3f\n", $val{substr($seq[1], $i, 3)});} else {print "\n";}}}' XXXX.tab > XXXX.travers gzip -c XXXX.travers > XXXX.travers.gz /usr/biotools/saco_convert -I genbank -O annotation XXXX.gbk > XXXX.ann gawk -F '\t' 'BEGIN{w=10000;o=5000;}function pad(pl,pc){ps="";for (k=1;k<=pl;k++) ps=ps pc; return ps;}{l=length($2); for (i = 1; i <= l; i += o) { s2=substr($2, i, w); ls2=length(s2); if (ls2 XXXX.10kfragments5kp.tab /usr/biotools/saco_convert -O fasta XXXX.10kfragments5kp.tab > XXXX.10kfragments5kp.fsa /usr/biotools/blast/bin/formatdb -i XXXX.10kfragments5kp.fsa -p F -t XXXX.10kfragments5kp mv XXXX.10kfragments5kp.fsa.nhr XXXX.10kfragments5kp.nhr mv XXXX.10kfragments5kp.fsa.nin XXXX.10kfragments5kp.nin mv XXXX.10kfragments5kp.fsa.nsq XXXX.10kfragments5kp.nsq rm -f error.log formatdb.log #EXCLUDE# /usr/biotools/indirect/uniqid > XXXX.10kfragments5kp.unique-string ##### DOES NOT SEEM TO BE USED AT ANY POINT ###### #EXCLUDE# echo "#/Users/tammi/Desktop/GenomeAtlas/blast-2.2.11/bin/usr/biotools/blast/bin/blastall -p blastn -F F -g F -d `pwd`/XXXX.10kfragments5kp -e 1 -i:4E534D5900007431" > XXXX.10kfragments5kp.blastn.repeats.cf ln XXXX.10kfragments5kp.tab XXXX.10kfragments5kp.blastn.tab ln XXXX.10kfragments5kp.blastn.tab XXXX.10kfragments5kp.blastn.repeats.tab /usr/biotools/saco_convert -O fasta XXXX.10kfragments5kp.blastn.repeats.tab > XXXX.10kfragments5kp.blastn.repeats.fsa /usr/biotools/blast/bin/blastall -p blastn -F F -g F -d XXXX.10kfragments5kp -e 1 -i XXXX.10kfragments5kp.blastn.repeats.fsa -o XXXX.10kfragments5kp.blastn.repeats.blast #EXCLUDE# /Users/tammi/Desktop/GenomeAtlas/runraw-4.0/runraw -b -c -p 0 -r -f XXXX.10kfragments5kp.blastn.repeats.cf XXXX.10kfragments5kp.blastn.repeats.fsa #EXCLUDE# get the unique string from the runraw cf file #EXCLUDE# mv XXXX.10kfragments5kp.blastn.repeats_`#gawk -F ':' '{print $2}' < XXXX.10kfragments5kp.blastn.repeats.cf` XXXX.10kfragments5kp.blastn.repeats.blast /usr/biotools/indirect/blastntable.pl XXXX.10kfragments5kp.blastn.repeats.blast > XXXX.10kfragments5kp.blastn.repeats.blastntable gawk '{printf("BLAST %8d %8d + %4.2f\n", 1, $1, 0);}' XXXX.length > XXXX.blastDirect.ann perl -ane '$F[16] =~ /_([0-9]+)-[0-9]+$/; $c = $1+$F[10]-1; $d = $1+$F[11]-1; $F[17] =~ /_([0-9]+)-[0-9]+$/; $e = $1+$F[13]-1; $f = $1+$F[14]-1; printf("BLAST %8d %8d + %4.2f %s %d %s %d\n", $c, $d, -log($F[7]+1e-9)/log(10), $F[16], $c, $F[17], $e) if $F[12] eq $F[15] && $d-$c >= 50 && $c != $e;' XXXX.10kfragments5kp.blastn.repeats.blastntable | \ sort -k5n >> XXXX.blastDirect.ann gawk '{if ($3 > a) a = $3; b++; c[5] += 0; d = $5; for (i = 6; i <= NF; i++) {c[i] += 0; d = d "\t" $i;} for (i = $2; i <= $3; i++) e[i] = d;}END{for (i = 1; i <= a; i++) if (i in e) {print e[i];} else {printf("%s", c[5]/(b != 0 ? b : 1)); for (j = 6; j in c; j++) printf("\t%s", c[j]/(b != 0 ? b : 1)); printf("\n");}}' XXXX.blastDirect.ann > XXXX.blastDirect.genomemap0 gzip -c XXXX.blastDirect.genomemap0 > XXXX.blastDirect.genomemap0.gz gawk '{printf("BLAST %8d %8d + %4.2f\n", 1, $1, 0);}' XXXX.length > XXXX.blastInverted.ann perl -ane '$F[16] =~ /_([0-9]+)-[0-9]+$/; $c = $1+$F[10]-1; $d = $1+$F[11]-1; $F[17] =~ /_([0-9]+)-[0-9]+$/; $e = $1+$F[13]-1; $f = $1+$F[14]-1; printf("BLAST %8d %8d + %4.2f\n", $c, $d, -log($F[7]+1e-9)/log(10)) if $F[12] ne $F[15] && $d-$c >= 50 && $c != $e;' XXXX.10kfragments5kp.blastn.repeats.blastntable | \ sort -k5n >> XXXX.blastInverted.ann gawk '{if ($3 > a) a = $3; b++; c[5] += 0; d = $5; for (i = 6; i <= NF; i++) {c[i] += 0; d = d "\t" $i;} for (i = $2; i <= $3; i++) e[i] = d;}END{for (i = 1; i <= a; i++) if (i in e) {print e[i];} else {printf("%s", c[5]/(b != 0 ? b : 1)); for (j = 6; j in c; j++) printf("\t%s", c[j]/(b != 0 ? b : 1)); printf("\n");}}' XXXX.blastInverted.ann > XXXX.blastInverted.genomemap0 gzip -c XXXX.blastInverted.genomemap0 > XXXX.blastInverted.genomemap0.gz /usr/biotools/indirect/makecm2.pl 100600_101010_000010 > 100600_101010_000010.cm2 /usr/biotools/indirect/makecm2.pl 001000_101010_100000 > 001000_101010_100000.cm2 /usr/biotools/indirect/makecm2.pl 001000_101010_100010 > 001000_101010_100010.cm2 /usr/biotools/indirect/makecm2.pl 101010_000010 > 101010_000010.cm2 /usr/biotools/indirect/makecm2.pl 101010_100000 > 101010_100000.cm2 /usr/biotools/indirect/makecm2.pl 100010_101010_001010 > 100010_101010_001010.cm2 /usr/biotools/indirect/makecm2.pl 001010_101010_100000 > 001010_101010_100000.cm2 genewiz -p XXXX.genomeatlas.ps XXXX.genomeatlas.cf rm XXXX.blastDirect.genomemap0 XXXX.10kfragments5kp.blastn.repeats.blastntable 001000_101010_100000.cm2 100010_101010_001010.cm2 001010_101010_100000.cm2 XXXX.baseskews.col XXXX.atlasfeatures.ornstein.rowavg.zscores 001000_101010_100010.cm2 XXXX.lengthtext XXXX.10kfragments5kp.blastn.repeats.blast XXXX.10kfragments5kp.fsa XXXX.curvature.gz XXXX.travers.gz XXXX.ornstein XXXX.atlasfeatures.genenames.tab XXXX.atlasfeatures.travers.rowavg.zscores XXXX.10kfragments5kp.blastn.tab XXXX.tab XXXX.atlasfeatures.genenames.names 101010_100000.cm2 XXXX.atlasfeatures.ornstein.rowavg.stddev XXXX.10kfragments5kp.nsq XXXX.atlasfeatures.ornstein.rowavg XXXX.atlasfeatures.tab XXXX.curvature XXXX.travers XXXX.blastInverted.genomemap0.gz XXXX.organismStripQuote XXXX.atlasfeatures.names XXXX.10kfragments5kp.blastn.repeats.tab XXXX.raw XXXX.10kfragments5kp.blastn.repeats.fsa XXXX.ornstein.gz XXXX.atlasfeatures.travers.rowavg.stddev 101010_000010.cm2 XXXX.baseskews.col.gz XXXX.10kfragments5kp.tab XXXX.atlasfeatures.travers.rowavg XXXX.blastDirect.genomemap0.gz XXXX.blastInverted.genomemap0 XXXX.organism XXXX.length 100600_101010_000010.cm2 XXXX.curvature.map XXXX.curvature.seq XXXX.chromosome