diff SMART/bacteriaRegulatoryRegion_Detection/seedGff.pl @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/bacteriaRegulatoryRegion_Detection/seedGff.pl	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,115 @@
+#!/usr/bin/perl -w
+###
+# Purpose : extension of the 5' UTRs from read clusters; this script writes
+#           the seed interval located upstream of each annotated feature.
+#
+# Inputs  : gff annotation file + cluster
+#
+# Output  : gff file of seeds (SEED.gff unless -o is given)
+#
+###------------------------------------------------------
+use vars qw($USAGE);
+use strict;
+use Getopt::Long;
+
+=head1 NAME
+
+seedGff.pl
+
+=head1 SYNOPSIS
+
+% seedGff.pl -i annotation.gff -p BeginPosFromAtg [-l lgSeed | -e EndPosFromAtg ] [-o output.gff] [-h]
+
+=head1 DESCRIPTION
+
+For each feature of the input gff file, this script writes the seed interval
+located upstream of the feature (strand-aware) to the output gff file.
+
+    -i|--input fileName         gff input file name of annotations
+    -p|--pos BeginPosFromAtg    greater positive number for the begin position of the seed from Atg
+   [-l|--length seedLength]     length of the seed to compute (not implemented yet)
+   [-e|--end seedEnd]           end of the seed to compute (smaller positive number)
+   [-o|--output fileName]       gff output file name (default SEED.gff)
+   [-h|--help]                  help mode then die
+
+=head1 AUTHOR - Claire Toffano-Nioche - mar.11
+
+    from Claire Kuchly initial script
+
+=cut
+#-----------------------
+# Show the embedded POD through pod2text, then leave with the given status.
+# An optional message is printed first, on STDOUT as the script always did.
+sub printHelp {
+    my ($status, $message) = @_;
+    print $message if defined $message;
+    # List form: no shell is involved, so $0 needs no quoting.
+    system('pod2text', $0) == 0 or warn "Can't run pod2text on $0\n";
+    exit($status);
+}
+
+my ($inFileName, $beginSeed, $endSeed, $lgSeed, $outFileName) = ("", 0, 0, 0, "SEED.gff") ;
+my $help = 0 ;
+
+# command line check
+# Getopt::Long matches whole option names; scanning @ARGV with unanchored
+# patterns would also fire on values such as a file called "my-input.gff".
+GetOptions(
+    'input|i=s'  => \$inFileName,
+    'pos|p=i'    => \$beginSeed,
+    'end|e=i'    => \$endSeed,
+    'length|l=i' => \$lgSeed,
+    'output|o=s' => \$outFileName,
+    'help|h'     => \$help,
+) or printHelp(1, "Error : invalid command line\n");
+printHelp(0) if $help;
+
+if ($inFileName eq "") {
+    printHelp(1, "Error : no gff input file, -i is mandatory\n");
+}
+if (($endSeed > 0) and ($lgSeed > 0)) {
+    printHelp(1, "Error : only -e or -l definition, not both\n");
+} elsif ($lgSeed > 0) {
+    # -l is not implemented: its value is ignored and, $endSeed being 0 here,
+    # the seed ends at the feature boundary.
+    print "ERROR : Lg Seed => TODO \n";
+}
+
+# Open the files only once the command line is known to be usable, so that a
+# bad invocation does not leave an empty output file behind.
+open(my $inGff, '<', $inFileName) or die "Can't open gff file: \"$inFileName\"\n" ;
+open(my $outGff, '>', $outFileName) or die "Error can't $outFileName open for output. $!\n";
+
+# Seed for the 5' clusters: they have to be anchored on position -20 relative
+# to the ATG, hence a seed spanning -22/-18 (that is -p 22 -e 18).
+while (my $ligne = <$inGff>) {
+    chomp($ligne);
+    # Blank lines and '#' comments/directives carry no feature.
+    next if $ligne =~ /^\s*$/ or $ligne =~ /^\s*#/ ;
+    my @list = split(/\t/, $ligne, -1) ;
+    if (@list < 9) {
+        warn "Skipping gff line $. (fewer than 9 tab-separated columns)\n";
+        next;
+    }
+    my ($debUTR5, $finUTR5) ;
+    my $strand = $list[6] ;
+    if ($strand eq "+") {
+        # Upstream of a forward feature lies before its start coordinate.
+        $debUTR5 = $list[3] - $beginSeed ;
+        $finUTR5 = $list[3] - $endSeed ;
+    } elsif ($strand eq "-") {
+        # Upstream of a reverse feature lies past its end coordinate.
+        $debUTR5 = $list[4] + $endSeed ;
+        $finUTR5 = $list[4] + $beginSeed ;
+    } else {
+        warn "Skipping gff line $. (strand \"$strand\" is neither + nor -)\n";
+        next;
+    }
+    # GFF coordinates are 1-based, so the seed may not start or end below 1.
+    $debUTR5 = 1 if $debUTR5 < 1 ;
+    $finUTR5 = 1 if $finUTR5 < 1 ;
+    print $outGff "$list[0]\t$list[1]\t5UTR\t$debUTR5\t$finUTR5\t$list[5]\t$list[6]\t$list[7]\t$list[8]\n";
+}
+close($inGff);
+close($outGff) or die "Error while closing $outFileName: $!\n";
+exit(0);