The second problem was to find a program to search the motifs in any sequence, I did not find any updated program (many are published, but few websites are active nowadays).

So, I decided to write my own program, I share the code if anyone can use it:

Any nice script to color my code inside Blogger?

#!/usr/bin/perl -w

use strict;

use Getopt::Long;

=head1 NAME

jasparScan.pl

=head1 USAGE

jasparScan.pl -f FASTA -m MATRIX [-s MODELS] [-c CUT_OFF]

OPTIONS:

-h --help Help screen

-f --fasta Fasta file to scan

-m --matrix Jaspar motif matrix

-s --models List of models, separated by ':' [Def: all]

-c --cutoff Similarity cut-off [Def: 0.90]

=head1 DESCRIPTION

Script to map a TF binding site matrix (from Jaspar DB)

into a fasta sequence. The similarity is defined as a

value between 0-1 (1 is the best match). Base by base

are compared in the matrix frequencies and normalized

by the best match expected. This scoring system is based

on a reduced MatInspector similarity matrix (Quandt et

al, 1995).

=cut

# Program parameters

my $fasta = undef;

my $matrix = undef;

my $models = undef;

my $help = undef;

my $cutoff = 0.9;

usage() unless (GetOptions('help|h' =>\$help,

'fasta|f=s' =>\$fasta,

'matrix|m=s'=>\$matrix,

'models|s:s'=>\$models,

'cutoff|c:s'=>\$cutoff ));

usage() if(defined $help);

usage() unless(defined $fasta and defined $matrix);

# Header print

print "# jasperScan.pl\n";

print "# FASTA=$fasta, MATRIX=$matrix, CUTOFF=$cutoff";

if (defined $models) { print ", MODELS=$models"; }

print "\n";

# Global variables

my %seqs = ();

my %mods = ();

my $sid = '';

my $seq = '';

my $num_seq = 0;

my $num_mod = 0;

# Loading sequences

$num_seq = loadSeqs($fasta);

if ($num_seq >= 1) {

print "# loaded $num_seq sequences\n";

} else {

die "no valid sequences\n";

}

# Loading matrix models

if (defined $models) {

$num_mod = loadMods($matrix, $models);

} else {

$num_mod = loadMods($matrix);

}

if ($num_mod <= 1) {

print "# loaded $num_mod models\n"

} else {

die "no valid models\n";

}

print "# SEQ\tPOS\tTFBS\tSIMIL\tRAW_SIM\tBEST_SIM\n";

# Main search

while ( ($sid, $seq) = each %seqs ) {

foreach my $mod (keys %mods) {

my $len = $mods{$mod}{'len'};

my $best = $mods{$mod}{'best'};

for (my $i = 0; $i <= ((length $seq) - $len); $i++) {

my $word = substr($seq, $i, $len);

next if($word =~ /[^ACGT]/);

my $sco = 0;

my $sco_n = 0;

for (my $j = 0; $j <= ($len - 1); $j++) {

my $b = substr($word, $j, 1);

$sco += $mods{$mod}{$b}[$j];

}

$sco /= $len;

$sco_n = $sco / $best;

next unless ($sco_n >= $cutoff);

my $ini = $i + 1;

my $end = $ini + $len;

print "$sid\t$ini-$end\t$mod\t$sco_n\t$sco\t$best\n";

}

}

}

=head1 SUBROUTINES

=cut

sub loadSeqs {

my $f = shift @_;

my $n = 0;

open F, "$f" or die "cannot open $f\n";

while (<F>) {

chomp;

if (/>(.+)/) {

$sid = $1;

$n++;

} else {

$seqs{$sid} .= $_;

}

}

close F;

return $n;

}

sub loadMods {

my $file = shift @_;

my $sel = shift @_;

my %sel = ();

my $n = 0;

if (defined $sel) {

my @sel = split (/:/, $sel);

foreach my $elem (@sel) { $sel{$elem}++; }

}

local $/ = "\n>";

open F, "$file" or die "cannot open $file\n";

while (<F>) {

if (defined $sel) {

/MA\d{4}/;

my $id = $&;

next unless(defined $sel{$id});

}

s/>//g;

s/\]//g;

s/\[//g;

my ($name, $fA, $fC, $fG, $fT) = split (/\n/, $_);

my @fA = split (/\s+/, $fA); shift @fA;

my @fC = split (/\s+/, $fC); shift @fC;

my @fG = split (/\s+/, $fG); shift @fG;

my @fT = split (/\s+/, $fT); shift @fT;

my $best = 0;

my $len = 0;

for (my $i = 0; $i <= $#fA; $i++) {

my $sum = $fA[$i] + $fC[$i] + $fG[$i] + $fT[$i];

$best += max($fA[$i],$fC[$i],$fG[$i],$fT[$i])/$sum;

$mods{$name}{'A'}[$i] = $fA[$i] / $sum;

$mods{$name}{'C'}[$i] = $fC[$i] / $sum;

$mods{$name}{'G'}[$i] = $fG[$i] / $sum;

$mods{$name}{'T'}[$i] = $fT[$i] / $sum;

$len++;

}

$mods{$name}{'best'} = $best / $len;

$mods{$name}{'len'} = $len;

$n++;

}

close F;

return $n;

}

sub max {

my $max = shift @_;

foreach my $x (@_) {

$max = $x if($x > $max);

}

return $max;

}

sub usage {

print <<_THIS_

jasparScan.pl -f FASTA -m MATRIX [-s MODELS] [-c CUT_OFF]

Script to map a TF binding site matrix (from Jaspar DB)

into a fasta sequence.

OPTIONS:

-h --help Help screen

-f --fasta Fasta file to scan

-m --matrix Jaspar motif matrix

-s --models List of model to use, separated by ':' [Def: all]

-c --cutoff Similarity cut-off [Def: 0.9]

Juan Caballero @ ISB 2010

_THIS_

;

exit 1;

}

=head1 AUTHOR

Juan Caballero

Institute for Systems Biology @ 2010

=head1 LICENSE

This is free software: you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation, either version 3 of the License, or

(at your option) any later version.

This is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with code. If not, see <http://www.gnu.org/licenses/>.

=cut

## No comments:

## Post a Comment