#!/usr/bin/perl
#
# BLASTatlas command line client.
#
# Reads a reference genome, optional protein/ORF translations, an annotation
# table and lane configuration files, builds one SOAP request from them,
# submits it to the CBS BLASTatlas web service, polls the queue until the job
# has finished and writes the decoded result (ps or pdf) to STDOUT.
# Progress messages go to STDERR.  Run with -h to print the help text stored
# after __DATA__.

use strict;
use warnings;
use Getopt::Long;
use XML::Compile;
use XML::Compile::WSDL11;
use XML::Compile::Transport::SOAPHTTP;
use MIME::Base64;
use Data::Dumper;

# NOTE(review): WSDL2proxy, appendSchemas and addOperations are presumably
# defined in xml-compile.pl and read_fasta in fasta.inc.pl -- confirm.
require "xml-compile.pl";
require "fasta.inc.pl";

my (
    $o,                $help,             @customlanes,
    @dblanes,          $windowBegin,      $windowEnd,
    $modus,            $REFERENCE_GENOME, $proteins,
    $orfs,             $blastcfg,         $customcfg,
    $REFERENCE_ANNOTATIONS, $dnap,        $TITLE,
);

GetOptions(
    "h"           => \$help,
    "o:s"         => \$o,
    "modus:s"     => \$modus,                    # geometry, circle/linear
    "ref:s"       => \$REFERENCE_GENOME,         # reference genome in fasta format
    "proteins:s"  => \$proteins,                 # reference sequence, either orfs or proteins
    "orfs:s"      => \$orfs,                     # reference sequence, either orfs or proteins
    "blastcfg:s"  => \$blastcfg,                 # setup of blast lanes
    "customcfg:s" => \$customcfg,                # setup of custom lanes
    "ann:s"       => \$REFERENCE_ANNOTATIONS,    # annotation file
    "begin:f"     => \$windowBegin,              # zoom begin, optional
    "end:f"       => \$windowEnd,                # zoom end, optional
    "dnap:s"      => \$dnap,                     # DNA parameters
    "title:s"     => \$TITLE,                    # title
) or print STDERR "# warning: some options were not understood (see -h)\n";

if ($help) {
    print while <DATA>;
    close DATA;
    exit;
}

# ensure defaults and print setup...
# The zoom window is only used when BOTH ends are given.
my $zoomed = ( defined $windowBegin and defined $windowEnd ) ? 1 : 0;
if ($zoomed) {
    print STDERR "# zooming region $windowBegin -> $windowEnd\n";
}
elsif ( defined $windowBegin or defined $windowEnd ) {
    print STDERR "# warning: -begin and -end must be given together; zoom ignored\n";
}

$TITLE = "[no title]" unless defined $TITLE;
print STDERR "# title set to '$TITLE'\n";

# Anchored checks: anything that is not exactly a supported value falls back
# to the default instead of being sent on (and failing later).
$o = "ps" unless defined $o and $o =~ /\A(?:ps|pdf)\z/;
print STDERR "# output format is $o\n";

$modus = 'circle' unless defined $modus and $modus =~ /\A(?:circle|linear)\z/;
print STDERR "# modus is '$modus'\n";

die "missing reference genome (-ref), see -h\n"
  unless defined $REFERENCE_GENOME;
die "missing reference annotations (-ann), see -h\n"
  unless defined $REFERENCE_ANNOTATIONS;

### Load details and setup for the lanes
print STDERR "# loading reference genome ...\n";
my @GENOME = read_fasta($REFERENCE_GENOME);

# A ternary is used on purpose: "my @x = ... if COND" has undefined behaviour.
print STDERR "# loading orfs ...\n" if defined $orfs;
my @ORFS = defined $orfs ? read_fasta($orfs) : ();

print STDERR "# loading proteins ...\n" if defined $proteins;
my @PROTEINS = defined $proteins ? read_fasta($proteins) : ();

# read the setup of BLAST lanes (read from config)
@dblanes = parse_blast_lanes($blastcfg) if defined $blastcfg;

# setup custom lanes (read from config)
@customlanes = parse_custom_lanes($customcfg) if defined $customcfg;

my @DB;
print STDERR "# reading external files and build hash of sequences ...\n";
foreach my $lane (@dblanes) {

    # blastp/blastx search a protein database; every other program is
    # handed over under the 'orfs' key.
    my $seq_type = _is_protein_db_program( $lane->{program} ) ? 'proteins' : 'orfs';
    my %db = (
        legend    => $lane->{legend},
        program   => $lane->{program},
        color     => $lane->{color},
        $seq_type => $lane->{seq},
    );

    # Only add a scaling entry when the lane actually configured one.
    my $range_type = _range_type($lane);
    $db{$range_type} = $lane->{$range_type} if defined $range_type;
    push @DB, \%db;
}

# parse the wsdl file and make proxy
my $proxy_blastatlas =
  WSDL2proxy('http://www.cbs.dtu.dk/ws/BLASTatlas/BLASTatlas_1_0_ws2.wsdl');

# append schemas
$proxy_blastatlas = appendSchemas(
    $proxy_blastatlas,
    "http://www.cbs.dtu.dk/ws/common/ws_common_1_0b.xsd",
    "http://www.cbs.dtu.dk/ws/BLASTatlas/ws_blastatlas_1_0_ws2.xsd"
);

# parse all operations of the wsdl
my %blastops = addOperations($proxy_blastatlas);

my $genome = $GENOME[0]->{seq};

print STDERR "# building array of translations...\n";
my @blastfeatures = (
    _cds_features( 'proteins', @PROTEINS ),
    _cds_features( 'orfs',     @ORFS ),
);

my @atlasfeatures;
my @ANNS;
print STDERR "# building reference annotations ...\n";
open my $ann_fh, '<', $REFERENCE_ANNOTATIONS
  or die "can't read reference annotations ('$REFERENCE_ANNOTATIONS'):$!\n";
while ( my $line = <$ann_fh> ) {
    my ( $feature, $begin, $end, $dir, $id ) = split /\s+/, $line;
    next unless defined $feature and $feature =~ m/rRNA|tRNA|CDS/;
    push @atlasfeatures,
      { begin => $begin, end => $end, type => $feature, dir => $dir, label => $id };

    # Annotation marks are only produced for features that lie completely
    # inside the zoom window.
    next unless $zoomed;
    next unless $end <= $windowEnd and $begin >= $windowBegin;
    my $dirposneg = ( defined $dir and $dir eq '-' ) ? 'neg' : 'pos';
    print STDERR "# .. including annotation mark: $feature,$id\n";
    push @ANNS, { type => $feature, dir => $dirposneg, label => $id };
}
close $ann_fh;

my @customMap;
print STDERR "# building custom lanes ...\n";
foreach my $lane (@customlanes) {
    my %map = (
        values    => $lane->{dat},
        legend    => $lane->{legend},
        boxfilter => $lane->{boxfilter},
        color     => $lane->{color},
    );
    my $range_type = _range_type($lane);
    $map{$range_type} = $lane->{$range_type} if defined $range_type;
    push @customMap, \%map;
}

my $window = $zoomed ? { begin => $windowBegin, end => $windowEnd } : undef;

print STDERR "# building request ...\n";
my @DNAP = defined $dnap ? split( ",", $dnap ) : ();

# Without any CDS translations an empty blastfeature element is sent.
my $blastfeature = @blastfeatures ? { feature => [@blastfeatures] } : {};
my $request = {
    parameters => {
        parameters => {
            discard_cache => 'no',
            main          => $TITLE,
            window        => $window,
            modus         => $modus,
            DNAparameters => [@DNAP],
            customMap     => [@customMap],
            reference     => {
                genome       => $genome,
                atlasfeature => { feature => [@atlasfeatures] },
                blastfeature => $blastfeature,
            },
            ann => [@ANNS],
            db  => [@DB],
        }
    }
};

print STDERR "# clearing variables ...\n";
undef @DB;
undef @ANNS;
undef @atlasfeatures;
undef @blastfeatures;
undef @dblanes;

print STDERR "# submitting request ...\n";
my $job = $blastops{runAtlas}->($request)
  or die "runAtlas: no response from the BLASTatlas service\n";
my $jobid = $job->{parameters}->{queueentry}->{jobid};
die "runAtlas: response did not contain a job id\n" unless defined $jobid;

wait_job( $blastops{pollQueue}, $jobid );

my $response = $blastops{fetchAtlasResult}->( job => { jobid => $jobid } )
  or die "fetchAtlasResult: no response for job $jobid\n";
my $output = $response->{parameters}->{output}->{$o};
die "fetchAtlasResult: job $jobid returned no '$o' output\n"
  unless ref $output eq 'HASH' and defined $output->{content};

printf STDERR "# fetching raw data: %s\n", _or_empty( $output->{comment} );
printf STDERR "# .. encoding: %s\n",       _or_empty( $output->{encoding} );
printf STDERR "# .. MIMEtype: %s\n",       _or_empty( $output->{MIMEtype} );

# The payload is binary (ps/pdf); avoid newline translation on STDOUT.
binmode STDOUT;
print decode_base64( $output->{content} );

# ---------------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------------

# parse_blast_lanes($file)
#
# Parses a BLAST lane configuration file made of "key:value" lines.  Every
# 'legend' line starts a new lane; 'program', 'color', 'source', 'range' and
# 'stddev' lines configure the lane started last.  Other lines are ignored.
#
# Returns a list of hash references with the keys legend, program, color
# (from/via/to => {r,g,b}), file, seq (all sequences of the source joined by
# ","), byrange (bottom/top) and byaverage (stddev), as far as configured.
# Dies when the file or a source cannot be opened, when a lane setting
# precedes the first legend, or on a malformed color range or range.
sub parse_blast_lanes {
    my ($file) = @_;
    my @lanes;
    my %is_lane_key = map { $_ => 1 } qw(program color source range stddev);

    print STDERR "# parsing blast lane configuration ($file) ...\n";
    open my $cfg, '<', $file
      or die "parse_blast_lanes(): can't open $file:$!";
    while ( my $line = <$cfg> ) {
        chomp $line;
        next unless $line =~ /^([a-z]+)\s*:\s*(.*)/;
        my ( $key, $value ) = ( $1, $2 );
        $value =~ s/\s+\z//;    # also removes the \r of DOS line endings

        if ( $key eq 'legend' ) {
            push @lanes, { legend => $value };
            print STDERR "# .. parsing blast lane ($value) ...\n";
            next;
        }
        next unless $is_lane_key{$key};
        die "parse_blast_lanes(): '$key' appears before the first 'legend' in $file\n"
          unless @lanes;
        my $lane = $lanes[-1];

        if ( $key eq 'program' ) {
            $lane->{program} = $value;
            print STDERR "# .. .. program: $value\n";
        }
        elsif ( $key eq 'color' ) {
            $lane->{color} = _parse_color($value);
        }
        elsif ( $key eq 'source' ) {
            $lane->{file} = $value;
            $lane->{seq} = join( ",", _read_sequences($value) );
        }
        elsif ( $key eq 'range' ) {
            $lane->{byrange} = _parse_range($value);
        }
        elsif ( $key eq 'stddev' ) {
            $lane->{byaverage}->{stddev} = $value;
            print STDERR "# .. .. byaverage: $value stddev\n";
        }
    }
    close $cfg;

    # strip weird chars ...
    # This has to happen after the sequences were joined.  It is limited to
    # protein databases because B and U are valid nucleotide codes, and it is
    # done here because 'program' may follow 'source' in the file.
    foreach my $lane (@lanes) {
        next unless defined $lane->{seq};
        next unless _is_protein_db_program( $lane->{program} );
        $lane->{seq} =~ s/[BJOU]/X/gi;
    }
    return @lanes;
}

# parse_custom_lanes($file)
#
# Parses a custom lane configuration file ("key:value" lines).  Every
# 'legend' line starts a new lane; 'boxfilter', 'color', 'range', 'stddev'
# and 'source' lines configure the lane started last.
#
# Returns a list of hash references with the keys legend, boxfilter, color,
# byrange, byaverage and dat (all lines of the source joined by ","), as far
# as configured.  Dies on the same conditions as parse_blast_lanes().
sub parse_custom_lanes {
    my ($file) = @_;
    my @lanes;
    my %is_lane_key = map { $_ => 1 } qw(boxfilter color range stddev source);

    print STDERR "# parsing custom lane configuration ($file) ...\n";
    open my $cfg, '<', $file
      or die "parse_custom_lanes(): can't open $file:$!";
    while ( my $line = <$cfg> ) {
        chomp $line;
        next unless $line =~ /^([a-z]+)\s*:\s*(.*)/;
        my ( $key, $value ) = ( $1, $2 );
        $value =~ s/\s+\z//;    # also removes the \r of DOS line endings

        if ( $key eq 'legend' ) {
            print STDERR "# .. parsing custom data entry $value ...\n";
            push @lanes, { legend => $value };
            next;
        }
        next unless $is_lane_key{$key};
        die "parse_custom_lanes(): '$key' appears before the first 'legend' in $file\n"
          unless @lanes;
        my $lane = $lanes[-1];

        if ( $key eq 'boxfilter' ) {
            $lane->{boxfilter} = $value;
            print STDERR "# .. .. boxfilter $value ...\n";
        }
        elsif ( $key eq 'color' ) {
            $lane->{color} = _parse_color($value);
        }
        elsif ( $key eq 'range' ) {
            $lane->{byrange} = _parse_range($value);
        }
        elsif ( $key eq 'stddev' ) {
            $lane->{byaverage}->{stddev} = $value;
            print STDERR "# .. .. byaverage: $value stddev\n";
        }
        elsif ( $key eq 'source' ) {
            $lane->{dat} = join( ",", _read_data_lines($value) );
        }
    }
    close $cfg;
    return @lanes;
}

# wait_job($op_handle, $jobid)
#
# Calls the queue polling operation $op_handle for $jobid until the reported
# status matches FINISHED or FAILED, printing every status change to STDERR.
# Waits 5 seconds between polls while the job is ACTIVE and 1 second
# otherwise, so the service is never polled in a tight loop.
# Dies when the job did not end as FINISHED, or when 10 polls in a row
# returned no status.
sub wait_job {
    my ( $op_handle, $jobid ) = @_;
    my $max_failures = 10;
    my $failures     = 0;
    my $status       = "UNKNOWN";

    print STDERR "# polling job $jobid\n";
    while (1) {
        my $response = $op_handle->( job => { job => { jobid => $jobid } } );
        my $entry =
          ref $response eq 'HASH' ? $response->{queueentry} : undef;
        $entry = ref $entry eq 'HASH' ? $entry->{queueentry} : undef;
        my $new_status = ref $entry eq 'HASH' ? $entry->{status} : undef;

        if ( !defined $new_status ) {
            die "# ERROR: job $jobid: no status from the queue after $max_failures polls\n"
              if ++$failures >= $max_failures;
            sleep 5;
            next;
        }
        $failures = 0;

        if ( $new_status ne $status ) {
            my $datetime = _or_empty( $entry->{datetime} );
            print STDERR "# job $jobid $new_status ($datetime)\n";
            $status = $new_status;
        }
        last if $status =~ /FINISHED|FAILED/;
        sleep( $status eq "ACTIVE" ? 5 : 1 );
    }
    die "# ERROR: job $jobid FAILED\n" if $status ne "FINISHED";
    return;
}

# Returns true when $program is a BLAST flavour that searches a protein
# database (blastp or blastx).
sub _is_protein_db_program {
    my ($program) = @_;
    return 0 unless defined $program;
    return ( $program eq "blastp" or $program eq "blastx" ) ? 1 : 0;
}

# Returns the scaling key configured for a lane: 'byaverage' when a stddev
# was given, else 'byrange' when a range was given, else undef.
sub _range_type {
    my ($lane) = @_;
    return 'byaverage' if defined $lane->{byaverage};
    return 'byrange'   if defined $lane->{byrange};
    return;
}

# Returns its argument, or an empty string when the argument is undefined.
sub _or_empty {
    my ($value) = @_;
    return defined $value ? $value : '';
}

# Parses a color range "A_B_C" (from, via, to), "A_B" (from, to) or "B"
# (from 000000 to B).  Each part holds three numbers of one or two digits,
# optionally separated by commas.  Returns { from|via|to => { r, g, b } }.
# Dies on a wrong number of parts; a part that cannot be parsed is reported
# and left out, so old configurations keep working.
sub _parse_color {
    my ($value) = @_;
    my @parts = split( "_", $value );
    my @names;
    if    ( @parts == 3 ) { @names = ( 'from', 'via', 'to' ); }
    elsif ( @parts == 2 ) { @names = ( 'from', 'to' ); }
    elsif ( @parts == 1 ) { @names = ( 'from', 'to' ); unshift @parts, "000000"; }
    else                  { die "invalid color range: $value\n"; }

    print STDERR "# .. .. parsing color $value\n";
    my %color;
    foreach my $i ( 0 .. $#names ) {
        if ( $parts[$i] =~ /^(\d{1,2}),*(\d{1,2}),*(\d{1,2})$/ ) {
            $color{ $names[$i] } = { r => $1, g => $2, b => $3 };
            print STDERR "# .. .. .. color $names[$i]: r:$1, g:$2, b:$3\n";
        }
        else {
            print STDERR
              "# .. .. .. warning: can't parse color $names[$i] ('$parts[$i]'), ignored\n";
        }
    }
    return \%color;
}

# Parses "BOTTOM<sep>TOP" where <sep> is whitespace, ':', ';' or ','.
# Returns { bottom => ..., top => ... }; dies when $value has no such pair.
sub _parse_range {
    my ($value) = @_;
    die "wrong range:$value\n"
      unless $value =~ /([0-9eE\.\-]+)[\s:;,]+([0-9eE\.\-]+)/;
    print STDERR "# .. .. byrange: $1 .. $2\n";
    return { bottom => $1, top => $2 };
}

# Reads fasta formatted sequences from $source and returns one string per
# entry.  Header lines only separate the entries; from every other line the
# first run of letters is appended to the current entry.
sub _read_sequences {
    my ($source) = @_;

    # Two-argument open is intentional: the documented configuration format
    # uses shell pipelines such as "cat ./19539.fsa |" as source.
    open my $in, $source
      or die "unable to open sequences for blast lane:$source\n";
    print STDERR "# .. parsing sequene source '$source' ...\r";
    my @seqs;
    while ( defined( my $line = <$in> ) ) {
        if ( $line =~ /^>/ ) {
            push @seqs, '';
            print STDERR "# .. parsing sequene source '$source' ... $#seqs\r"
              if $#seqs % 100 == 0;
        }
        elsif ( $line =~ /([A-Za-z]+)/ ) {
            die "sequence data before the first fasta header in '$source'\n"
              unless @seqs;
            $seqs[-1] .= $1;
        }
    }
    close $in
      or print STDERR "\n# .. warning: closing source '$source' failed (status $?)\n";
    print STDERR "# .. parsing sequene source '$source' ... $#seqs done\n";
    return @seqs;
}

# Reads all lines of $source (newline removed) and returns them as a list.
sub _read_data_lines {
    my ($source) = @_;
    print STDERR "# .. parsing data source '$source' ...\n";

    # Two-argument open is intentional, see _read_sequences().
    open my $in, $source
      or die "unable to open custom data src $source:$!";
    my @values;
    while ( defined( my $line = <$in> ) ) {
        chomp $line;
        push @values, $line;
        print STDERR "# .. .. parsing data source ... " . scalar(@values) . "\r"
          if @values % 100000 == 0;
    }
    close $in
      or print STDERR "\n# .. warning: closing source '$source' failed (status $?)\n";
    print STDERR "# .. .. parsing data source ... " . scalar(@values) . " done\n";
    return @values;
}

# Turns fasta entries whose id contains "CDS_<a>-<b>" into blast features
# { begin, end, dir, $seq_key => sequence }.  a > b marks the '-' strand;
# begin is always the smaller coordinate.  Other entries are skipped.
sub _cds_features {
    my ( $seq_key, @entries ) = @_;
    my @features;
    foreach my $entry (@entries) {
        next unless defined $entry->{id};
        next unless $entry->{id} =~ /CDS_(\d+)\-(\d+)/;
        my ( $from, $to ) = ( $1, $2 );
        my $dir = $from > $to ? "-" : "+";
        ( $from, $to ) = ( $to, $from ) if $dir eq "-";
        push @features,
          { begin => $from, end => $to, dir => $dir, $seq_key => $entry->{seq} };
    }
    return @features;
}

__DATA__

NAME
    BLASTatlas - Web Services client for CBS BLASTatlas (http://www.cbs.dtu.dk/ws/BLASTatlas)

SYNOPSIS
    BLASTatlas (-h) (-o [FORMAT]) -modus [MODUS] -ref [REFERENCE] -proteins [PROTEOME]
        -orfs [OPEN READING FRAMES] -blastcfg [BLAST LANES CF] -customcfg [CUSTOM LANES CF]
        -ann ANNOTATIONS (-begin [ZOOM BEGIN]) (-end [ZOOM END]) -dnap [DNA PARAMETERS]
        -title [TITLE]

DESCRIPTION
    This is an example client script for accessing the CBS BLASTatlas service (SOAP) using XML::Compile.

OPTIONS
    -o [pdf/ps]
        Output format ps (PostScript) or pdf (Portable Document Format)

    -modus [linear/circle]
        Controls the modus of the atlas: Linear or circular representation

    -ref [FILE]
        The location of the reference genome sequence in fasta format. Must be in one continous contig.

    -proteins [FILE]
        The proteins encoded by the reference genome in fasta format.
The ID of each fasta entry must conform to the following format (default output of saco_extract): # ------------------ >CDS_1-1188 comments are ignored MTLDEIRQSIREELDALRVSGARRQDLSLHACKRLFFDLGIRPSTANVRELTQTGSASDI PKDIDHFWERIRSASKVRLEGAALPKPLEEKAGALLGALYEEALKQARESLDGERQQASS EVAAAEQKYRDALIRQEALEAAVARSDARAEQLQSRLTDVEVQLASASTQGLAHQDSLQG LIRRMENENALLNQRLEAEQSQNAALRERIDALHAELRENTEHYAQQIKDAIAEAERRVK PMLVELDSLRSMASTYQQGLRDVNRKEFDFLQQLSAAKARADRLDAQLREQGDELTAALN EVQILRASQGMTPEIAALIRRLANAGNLDAAAFDTIGTSLDQHVTLPARCPKCGDGEPEL SHDAHGYELLCPECEHASGTQPSRFAAAACFTRTT ... # ------------------ For proteins encoded by genes located on the negative strand, format must be as follows: # ------------------ >CDS_2325-1282 protein encoded by gene in negative direction MSLPILTSPPSPPDIPSDLFDRDTSDWLNHPDTAFDAWLAAQEFRRSSADVYRAQWGAFL GWLAKRQKNLATVDTETIALFVGELPIKKTQRVRYLRLIERVLDHVRKNEFGSTNPARFI AQDGEATWRAARDNEPTSFLAPAERAALLAYLFSPLPAAGAALWKERRDRALIAVFLGGG LKTGEARALTISCVKLGSTMLTIPAAQPEFTRETHLASFAIALFDAWLAERQRCGIPGEL VFPASQSGRPMHKATMLRAVDAIVDAADIAQSRTARASPQTLRNTFAAELFENGVEPEKV GQWLGFAQPISSNRLHRAWKNWRESLASALPNAESLTDEISTEPSQA ... # ------------------ -orfs [FILE] The open reading frames of the reference genome. IDs of fasta entries must be of same format (CDS_X-Y) as for the -proteins file. # ------------------ >BX571966_CDS_1-1188 /locus_tag="BPSS0001" /note="No signifi ATGACTCTCGACGAGATCCGGCAATCCATCCGTGAAGAACTCGATGCGCTGCGCGTATCC GGCGCTCGCCGGCAAGACTTATCGCTCCATGCGTGCAAGCGATTGTTCTTCGATCTCGGC ... # ------------------ -customcfg [FILE] # ------------------ legend:SIDD@-0.035 color:000010_101010 range:9:10 boxfilter:5000 source:gunzip -c BX571966-57a2f2c2e11ca0dd8cd74493d667d4d6-3173005.sidd--0.035-c-10-c.out.gz | cut -f4 | # ------------------ -blastcfg [FILE] Configuration of BLAST lanes. Must have the following format # ------------------ legend:B. ubonensis Bu program:tblastn color:101010_040410 range:0,0.8 source:cat ./19539.fsa | legend:B. 
pseudomallei DM98 program:tblastn color:101010_040410 range:0,0.8 source:cat ./19509.fsa | # ------------------ Be aware of tabs/spaces: tabs separate fields - spaces do not! -dnap [p1,p2,p3 ...] >Intrinsic Curvature DNA curvature is calculated using the CURVATURE programme (Bolshoy et al. 1991, Shpigelman et al. 1993). The term curved DNA here refers to DNA that is intrinsically curved in solution and can be readily characterised by anomalous migration in acrylamide gels. There are different models for curved DNA (Sinden et al. 1998), although the predictions of curvature for fragments larger than a few hundred bp are essentially the same (Haran et al. 1994). The scale is in arbitrary "Curvature units", which range from 0 (i.e. no curvature) to 1.0, which is the curvature of DNA when wrapped around the nucleosome. The scale used for this atlas ranges 3 standard deviations around the mean. * R.R. Sinden and C.E. Pearson and V.N. Potaman and D.W. Ussery DNA: Structure and Function (1998) 5A:1-141 * E.S. Shpigelman and E.N. Trifonov and A. Bolshoy CURVATURE: Software for the Analysis of Curved DNA. (1993) 9:435-444 * T.E. Haran and J.D. Kahn and D.M. Crothers Sequences elements responsible for DNA curvature (1994) 225:729-738 * A. Bolshoy and P. McNamara and R.E. Harrington and E.N. Trifonov Curved DNA Without A-A - Experimental Estimation of All 16 DNA Wedge Angles (1991) 88:2312-2316 >Position Preference - a trinucleotide model based on the preferential location of sequences within nucleosomal core sequences (Satchwell et al. 1986). We use the magnitude (i.e. absolute values) of the trinucleotide numbers as a measure of DNA flexibility (Baldi et al. 1996). The trinucleotide values range from essentially zero (0.003, presumably more flexible), to 0.28 (considered rigid). Since very few of the trinucleotides have values close to zero (i.e. little preference for nucleosome positioning), this measure is considered most sensitive towards the low ("flexibility") * S.C. 
Satchwell and H.R. Drew and A.A. Travers Sequence periodicities in chicken nucleosome core DNA (1986) 191:659-675 * P. Baldi and S. Brunak and Y. Chauvin and A. Krogh Naturally occurring nucleosome positioning signals in human exons and introns. (1996) 263:503-510 >Stacking Energy Base-stacking energies are from the dinucleotide values provided by (Ornstein et al. 1978). The scale is in kcal/mol, and the dinucleotide values range from -3.82 kcal/mol (will melt easily) up to a maximum value of -14.59 kcal/mol (which would require more energy to destack or melt the helix). (All 10 values are listed in the table below.) A positive peak in base-stacking (i.e., numbers closer to 0) reflects regions of the helix which would de-stack or melt more readily. Conversely, minima (larger negative numbers) in this plot would represent more stable regions of the chromosome. Dinucleotide melting energies in kcal/mol: (GC).(GC) -14.59 (AC).(GT) -10.51 (TC).(GA) -9.81 (CG).(CG) -9.61 (GG).(CC) -8.26 (AT).(AT) -6.57 (TG).(CA) -6.57 (AG).(CT) -6.78 (AA).(TT) -5.37 (TA).(TA) -3.82 * R.L. Ornstein and R. Rein and D.L. Breen and R.D. MacElroy An optimized potential function for the calculation of nucleic acid interaction energies. I. Base stacking (1978) 17:2341-2360 >Protein Deformability "Protein Induced Deformability" dinucleotide values are from protein induced deformation of DNA helices as determined by examination of more than a hundred crystal structures of DNA/protein complexes (Olson et al. 1998). The dinucleotide values range from 2.1 (the least deformable dinucleotide), to 12.1 (i.e., the dinucleotide step (CpG), which is often deformed by proteins). Thus, on this scale, a larger value reflects a more deformable sequence whilst a smaller value indicates a region where the DNA helix is less likely to be changed dramatically by proteins. The average protein deformability value in the entire E. coli K-12 genome is 5.12. * Goffeau et al. 
The Yeast Genome Directory (1997) 387 (supplement):5-105 * W.K. Olson and A.A. Gorin and X.J. Lu and L.M. Hock and V.B. Zhurkin DNA sequence-dependent deformability deduced from protein-DNA crystal complexes. (1998) 95:11163-11168 >Propeller twist We use propeller twist as a measure of helix rigidity, since the propeller twist angles have been shown to be inversely related to rigidity of the DNA helix in crystals (el Hassan et al. 1996). Thus, a region with high propeller twist would mean the helix is quite rigid in this area, and similarly regions that are quite flexible would have a low propeller twist. Propeller twist values were obtained from crystallographic data (el Hassan et al. 1996), with the exception of the TA step, which was taken from a theoretical estimate (Gorin et al. 1995). Plots using other sets of propeller twist dinucleotide values were very similar (data not shown). The average propeller twist value in the entire E. coli K-12 genome is -12.63 degrees. * Goffeau et al. The Yeast Genome Directory (1997) 387 (supplement):5-105 * M.A. el Hassan and C.R. Calladine Propeller-twisting of base-pairs and the conformational mobility of dinucleotide steps in DNA. (1996) 259:95-103 * A.A. Gorin and V.B. Zhurkin and W.K. Olson B-DNA twisting correlates with base-pair morphology. (1995) 247:34-48 >DNase I Sensitivity DNase I values are based on experimentally determined trinucleotide values (Brukner et al. 1995, Brukner et al. 1995). These values are reflective of the anisotropic flexibility or "bendability" of a particular DNA sequence. The trinucleotide values range from -0.280 (rigid) to +0.194 (very "bendable" towards the major groove). Smoothing over large regions (which is necessary for viewing entire genomes) tends to smooth out differences in bendability. The average DNase I ("bendability") value in the * I. Brukner and R. Sanchez and D. Suck and S. 
Pongor Sequence-dependent bending propensity of DNA as revealed by DNase I: parameters for trinucleotides. (1995) 14:1812-1818 * I. Brukner and R. Sanchez and D. Suck and S. Pongor Trinucleotide models for DNA bending propensity: comparison of models based on DNaseI digestion and nucleosome packaging data. (1995) 13:309-317 >Palindromic hexamers For a given sequence, any palindrome of 6 nt (e.g., AAATTT) is given a value of 1, while all bases not included in palindromic hexamers are given a value of 0 (van Noort et al. 2003). * van Noort V, Worning P, Ussery DW, Rosche WA, Sinden RR Strand misalignments lead to quasipalindrome correction (2003) 19:365-9 >G Content The "G Content" of a given sequence is merely the fraction of G's in a given sequence (Jensen et al. 1999). It can range from 0 (no G's), to 1 (all G's). For a sequence that is 50% AT content, one would expect roughly 25% G's. * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 >A Content The "A Content" of a given sequence is merely the fraction of A's in a given sequence (Jensen et al. 1999). It can range from 0 (no A's), to 1 (all A's). For a sequence that is 50% AT content, one would expect roughly 25% A's. * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 >T Content The "T Content" of a given sequence is merely the fraction of T's in a given sequence (Jensen et al. 1999). It can range from 0 (no T's), to 1 (all T's). For a sequence that is 50% AT content, one would expect roughly 25% T's. * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 >C Content The "C Content" of a given sequence is merely the fraction of C's in a given sequence (Jensen et al. 1999). It can range from 0 (no C's), to 1 (all C's). For a sequence that is 50% AT content, one would expect roughly 25% C's. * L. J. Jensen and C. Friis and D.W. 
Ussery Three views of complete chromosomes (1999) 150:773-777 >GC Skew For many genomes there is a strand bias, such that one strand tends to have more G's, whilst the other strand has more C's. This GC-skew bias can be measured as the number of G's minus the number of C's over a fixed length (e.g. 10,000 bp) of DNA (Jensen et al. 1999). The values can range from +1 (all G's on the examined sequence, with all C's on the other strand), to -1 (the reverse case - all C's on the examined sequence, and all G's on the other strand). There is a correlation with GC-skew and the replication leading and lagging strands. * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 >Percent AT The percent AT is a running average of the AT content, over a given window size. Typically for a bacterial genome of about 5 Mbp, the window size is 10,000 bp. The Percent AT can range from 0 (no AT content) to 1 (100% AT). The Percent AT is correlated with other DNA structural features, such that AT rich regions are often more readily melted, tend to be less flexible and more rigid, although they can also be readily compacted by chromatin proteins (Pedersen et al. 2000). * A.G. Pedersen and L.J. Jensen and H.H. Staerfeldt and S. Brunak and D.W. Ussery A DNA structural atlas of E. coli (2000) 299:907-930 >AT Skew For some genomes there is also an AT strand bias, such that one strand tends to have more A's, whilst the other strand has more T's. This AT-skew bias is measured as the number of A's minus the number of T's over a fixed length (e.g. 10,000 bp) of DNA (Jensen et al. 1999). The values can range from +1 (all A's on the examined sequence, with all T's on the other strand), to -1 (the reverse case - all T's on the examined sequence, and all A's on the other strand). For some genomes, there is a correlation with AT-skew and the replication leading and lagging strands. * L. J. Jensen and C. Friis and D.W. 
Ussery Three views of complete chromosomes (1999) 150:773-777 >Global Direct Repeats Global Direct repeats are found by taking the first 100 bp of sequence, and looking for the best match within the whole segment, on the same strand, in the same direction [5' to 3'] (Skovgaard et al. 2002). Values are binned into 10 values, and represent the lower end of the best match, and range from 0 (10% or less match) to 9 (at least 90 out of the 100 nucleotides match perfectly). >Global Inverted Repeats Global Inverted repeats are found by taking the first 100 bp of sequence, and looking for the best match within the whole segment, on the opposite strand, in the same direction [5' to 3'] (Skovgaard et al. 2002). Values are binned into 10 values, and represent the lower end of the best match and range from 0 (10% or less match) to 9 (at least 90 out of the 100 nucleotides match perfectly). * M. Skovgaard and L.J. Jensen and C. Friis and H.H. Staerfeldt and P. Worning and S. Brunak The Atlas Visualization of Genomewide Information (2002) 33:49-63 >Direct Repeats Local Direct repeats are found by taking a 100 bp sequence window, and looking for the best match of a 30 bp piece within that window, on the same strand, in the same direction (Jensen et al. 1999). Values can range from 0 (no match at all) to 1 (one or more perfect match within the window). * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 >Everted Repeats Local Everted repeats are found by taking a 100 bp sequence window, and looking for the best match of a 30 bp piece within that window, on the opposite strand, in the same direction (Jensen et al. 1999). Values can range from 0 (no match at all) to 1 (one or more perfect match within the window). * L. J. Jensen and C. Friis and D.W. 
Ussery Three views of complete chromosomes (1999) 150:773-777 >Local Inverted Repeats Local Inverted repeats are found by taking a 100 bp sequence window, and looking for the best match of a 30 bp piece within that window, on the opposite strand, in the opposite direction (Jensen et al. 1999). Values can range from 0 (no match at all) to 1 (one or more perfect match within the window). * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 >Mirror Repeats Local Mirror repeats are found by taking a 100 bp sequence window, and looking for the best match of a 30 bp piece within that window, on the same strand, in the opposite direction (Jensen et al. 1999). Values can range from 0 (no match at all) to 1 (one or more perfect match within the window). * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 >Quasi-palindromes "Quasi-palindromes" are short inverted repeats, which are found by taking a 30 bp piece of sequence, and looking for matches with at least 6 out of 7 nt matching, on the opposite strand, in the opposite direction (van Noort et al. 2003). Values can range from 0 (no match at all) to 1 (one or more perfect match within the window). * van Noort V, Worning P, Ussery DW, Rosche WA, Sinden RR Strand misalignments lead to quasipalindrome correction (2003) 19:365-9 >Perfect-palindromes "Perfect-palindromes" are short inverted repeats, which are found by taking a 30 bp piece of sequence, and looking for perfect matches of 7 nt or longer, on the opposite strand, in the opposite direction (van Noort et al. 2003). Values can range from 0 (no match at all) to 1 (one or more perfect match within the window). * van Noort V, Worning P, Ussery DW, Rosche WA, Sinden RR Strand misalignments lead to quasipalindrome correction (2003) 19:365-9 >Simple Repeats A "simple repeat" is a region which contains a simple oligonucleotide repeat, like microsatellites. 
Simple repeats are found by looking for tandem repeats of length R within a 2R-bp window. By using the values 12, 14, 15, 16, and 18 for R, all simple repeats of lengths 1 through 9 are calculated, of length of at least 24 bp (Jensen et al. 1999). Values can range from 0 (no match at all) to 1 (one or more perfect match within the window). * L. J. Jensen and C. Friis and D.W. Ussery Three views of complete chromosomes (1999) 150:773-777 EXAMPLE BLASTatlas-server1.0ws2_client1.1 --customcfg=custom-1.1.cfg --modus=circle \ --ref=BX571966.fsa --proteins=BX571966.proteins.fsa --blastcfg=blast-1.1.cfg \ --ann=BX571966.cdsPLUScbsrna.ann --title="B. pseudomallei K96243 Chr. II" \ --dnap="Intrinsic Curvature,Stacking Energy,Position Preference,Percent AT" \ -o ps > BLASTatlas.ps SEE ALSO saco_convert, saco_extract AUTHOR Peter Fischer Hallin, October 2008, pfh@cbs.dtu.dk