annotate interpro/paso4.pl @ 0:c342ebb50f0b draft default tip

Uploaded
author fernando
date Thu, 22 May 2014 05:09:07 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c342ebb50f0b Uploaded
fernando
parents:
diff changeset
#!/usr/bin/perl -w
$| = 1;    # Unbuffered STDOUT

# Given a GFF3 file that contains the analysis of several sequences, this
# program writes a text file with the values common to all of those
# sequences for the attributes:
#   Name          -> database entry from which a given feature was obtained;
#   Ontology_term -> Gene Ontology entries for a given feature;
#   Dbxref        -> InterPro entry for a given feature.

use strict;
use warnings;


# Variable declaration and initialisation.
# Each variable is declared on its own: "my (@a, @b) = (...)" hands the whole
# right-hand list to @a and leaves @b empty, and "my ($x, $y) = ''" only
# initialises $x.

my $fichero_ent = "";    # Name of the GFF3 input file, taken from the command line
my $output      = "";    # Output file name, passed as a parameter
my @id_1        = ();    # Temporarily holds the ##sequence-region directive lines
my @id_2        = ();    # Temporarily holds the fields of one directive line
my @ids         = ();    # Every sequence ID found in the file
my @temp        = ();    # Every feature line found in the file
my $lin         = "";    # The ID currently taken from @ids
my @lin_id      = ();    # All feature lines belonging to a given ID
my @t1          = ();    # Temporarily holds the columns of one feature line
my @t3          = ();    # Temporarily holds the attributes of one feature line
my $atributo    = "";    # Only the "attributes" column of a feature line
my @etiquetas   = ("Name", "Ontology_term", "Dbxref");    # The three attribute types extracted from the file
my @sel_atrib   = ();    # Attributes of one row that match a given tag
my $etiq        = "";    # Tag of one attribute in a line
my $atrib       = "";    # Value of one attribute in a line
my @val_atrib   = ();    # The different values one attribute may have in a line

# One array reference per entry of @etiquetas.
# @valores holds the values of each attribute for the sequence being processed;
# @valores_rep holds the values shared by every sequence processed so far.
# Both must start with three empty array refs so that they can be safely
# dereferenced even when the input file contains no sequence at all.
my @valores     = ([], [], []);
my @valores_rep = ([], [], []);

my @repetidos   = ();    # Temporarily holds the common values of one attribute


######## Open the input file and select its lines #########

# Both file names come from the command line: <input.gff3> <output.txt>.
($fichero_ent, $output) = @ARGV;
die "Usage: $0 <input.gff3> <output.txt>\n"
    unless defined $fichero_ent && defined $output;

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and the OS error is reported through $!.
open(my $gff3_fh, '<', $fichero_ent)
    or die "Failure to open the file \"$fichero_ent\": $!\n";

while (my $linea = <$gff3_fh>) {
    chomp $linea;
    $linea =~ s/\r\z//;    # Tolerate CRLF line endings

    # Everything from the ##FASTA directive onwards is sequence data.
    last if $linea =~ /^##FASTA/;

    if ($linea =~ /^##sequence-region./) {
        # Keep the sequence-region directives; the sequence IDs come from them.
        push(@id_1, $linea);
    }
    elsif ($linea =~ /^#/) {
        # Any other comment or directive line is ignored.
        next;
    }
    else {
        # Feature lines (with the attributes) of every sequence ID.
        push(@temp, $linea);
    }
}
close($gff3_fh);

########### Extract the sequence ID from each directive line ##################

# Per the GFF3 specification a directive reads
# "##sequence-region <seqid> <start> <end>", so the ID is the second field.
foreach my $directiva (@id_1) {
    # split ' ' splits on runs of whitespace, so several blanks or a tab
    # between the fields cannot produce an empty ID.
    @id_2 = split(' ', $directiva, 3);

    # A directive without an ID would otherwise select every feature line.
    next unless defined $id_2[1] && length $id_2[1];

    push(@ids, $id_2[1]);    # Collect every sequence ID present in the file
}

########### Select each sequence ID, collect its attribute values ##################
########### and keep only the values shared by every sequence     ##################

# Collects the distinct values of each requested attribute tag for one sequence.
#
# Parameters:
#   $id_seq        - sequence ID; a feature line belongs to it when its first
#                    tab-separated column is exactly this string.
#   $lineas_ref    - array ref of feature lines (tab-separated, attributes in
#                    the ninth column as "tag=value1,value2;tag=...").
#   $etiquetas_ref - array ref of attribute tags to extract.
#
# Returns an array ref holding, for each tag (same order as $etiquetas_ref),
# an array ref with the distinct values found, in order of first appearance.
sub _valores_por_etiqueta {
    my ($id_seq, $lineas_ref, $etiquetas_ref) = @_;

    my @por_etiqueta = map { [] } @{$etiquetas_ref};

    foreach my $linea (@{$lineas_ref}) {
        # \Q..\E makes the ID literal and the trailing tab anchors the whole
        # first column, so "seq1" does not also select the lines of "seq10".
        next unless $linea =~ /^\Q$id_seq\E\t/;

        my $atributos = (split(/\t/, $linea))[8];
        next unless defined $atributos;    # Malformed line without a ninth column

        my @pares = split(/;/, $atributos);

        for my $n (0 .. $#{$etiquetas_ref}) {
            my $etiqueta = $etiquetas_ref->[$n];

            # As before, only the first attribute carrying this tag is used.
            my ($par) = grep { /^\Q$etiqueta\E=/ } @pares;
            next unless defined $par;

            my (undef, $lista) = split(/=/, $par, 2);
            foreach my $valor (split(/,/, $lista)) {
                next unless length $valor;

                # Exact string comparison: a regex match would treat "PF0001"
                # as already present once "PF00012" had been stored.
                next if grep { $_ eq $valor } @{ $por_etiqueta[$n] };
                push(@{ $por_etiqueta[$n] }, $valor);
            }
        }
    }

    return \@por_etiqueta;
}

# Returns the elements of @$candidatos_ref that also occur in @$comunes_ref
# (exact string comparison), keeping the order of @$candidatos_ref.
sub _valores_comunes {
    my ($candidatos_ref, $comunes_ref) = @_;

    my %es_comun = map { $_ => 1 } @{$comunes_ref};
    return grep { $es_comun{$_} } @{$candidatos_ref};
}

my $i = 0;    # 0 while the first sequence is processed, non-zero afterwards
foreach $lin (@ids) {
    @valores = @{ _valores_por_etiqueta($lin, \@temp, \@etiquetas) };

    for my $cont (0 .. $#etiquetas) {
        if ($i == 0) {
            # The first sequence contributes all of its values: the final
            # answer can at most be this set.
            $valores_rep[$cont] = [ @{ $valores[$cont] } ];
        }
        else {
            # Every later sequence narrows the set down to the values it shares.
            $valores_rep[$cont] = [ _valores_comunes($valores[$cont], $valores_rep[$cont]) ];
        }
        $valores[$cont] = [];    # Reset for the next sequence
    }
    $i++;
}


############ Print the results ######################

die "No output file name was given\n"
    unless defined $output && length $output;

# Three-argument open with a lexical handle. A failure is fatal: it is
# reported on STDERR with the OS error and gives a non-zero exit status,
# instead of a message on STDOUT followed by a successful exit.
open(my $out_fh, '>', $output)
    or die "The file \"$output\" can not be opened: $!\n";

print {$out_fh} "The common atributes of the sequences ", "@ids", " are:\n";
for my $cont (0..2) {
    # An entry that was never filled in (no sequence processed) counts as empty.
    my @comunes = @{ $valores_rep[$cont] || [] };

    print {$out_fh} "For atribute ", "$etiquetas[$cont]", " : ";
    if (@comunes == 0) {
        print {$out_fh} "No common atributes \n";
    }
    else {
        print {$out_fh} "@comunes", "\n";
    }
}

# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close($out_fh) or die "Error while writing \"$output\": $!\n";
exit;