annotate PGAP-1.2.1/Converter_NCBINewFormatData.pl @ 10:f33df1459a4b draft

Uploaded
author dereeper
date Mon, 28 Jun 2021 19:26:50 +0000
parents 83e62a1aeeeb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
2 use strict;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
3 use warnings;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
4 use Getopt::Long;
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
5
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
6
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
7
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# ---------------------------------------------------------------------------
# Command-line handling
#
# Fills the file-level variables used by the conversion loop further down:
#   $inprefix / @inList    "+"-joined input prefixes and the resulting list
#   $outprefix / @outList  optional "+"-joined output prefixes and their list
#   $indir, $outdir        input and output directories
# Any problem ends the script through print_usage().
# ---------------------------------------------------------------------------
my %optionHash;
my $inprefix  = "";
my $outprefix = "";
my $indir     = "";
my $outdir    = "";
my @inList;
my @outList;

# GetOptions returns false for unknown or malformed options; stop right away
# instead of carrying on with a half-parsed command line.
# print_usage is called with & so that the call does not depend on how the
# sub is declared further down in the file.
GetOptions(\%optionHash, "inprefix:s", "outprefix:s", "indir:s", "outdir:s", "help|h!")
    or &print_usage("Invalid command line, please check the options!");

# No option at all, or an explicit help request: just show the usage text.
# ("help|h" is always stored under the key "help", so only that key is tested.)
if (scalar(keys %optionHash) == 0 or exists($optionHash{"help"})) {
    &print_usage("");
}

# Mandatory options, reported in the same order as before
foreach my $required ("inprefix", "indir", "outdir") {
    if (!exists($optionHash{$required})) {
        &print_usage("--$required should be provided!");
    }
}

$inprefix  = $optionHash{"inprefix"};
$indir     = $optionHash{"indir"};
$outdir    = $optionHash{"outdir"};
$outprefix = $optionHash{"outprefix"} if exists($optionHash{"outprefix"});

# The ":s" option type accepts a missing value, which arrives here as "".
if ($inprefix eq "" or $indir eq "" or $outdir eq "") {
    &print_usage("--inprefix, --indir and --outdir need a non-empty value!");
}

# Several strains are given as prefixes joined with "+"
@inList  = split(/\+/, $inprefix);
@outList = split(/\+/, $outprefix);

if (scalar(@inList) == 0) {
    &print_usage("--inprefix does not contain any prefix!");
}

if (scalar(@outList) > 0 and scalar(@outList) != scalar(@inList)) {
    &print_usage("If outprefix was provided, the name number should be identical with inprefix");
}

# Create the output directory. The list form of system() bypasses the shell,
# so directory names with spaces or shell metacharacters are handled literally.
if (!-d $outdir) {
    system("mkdir", "-p", "--", $outdir) == 0
        or die "Cannot create output directory $outdir\n";
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
62
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# ---------------------------------------------------------------------------
# Convert every requested strain.
#
# For each input prefix the three input files
#   <indir>/<prefix>_feature_table.txt, _protein.faa and _genomic.fna
# are read, and for every gene that has an annotation, a protein sequence and
# a replicon sequence the records are written to
#   <outdir>/<outname>.pep, .nuc and .function
# A strain with a missing input file is reported and skipped.
# ---------------------------------------------------------------------------
STRAIN:
foreach my $idx (0 .. $#inList) {
    my $inname        = $inList[$idx];
    my $feature_table = $indir . "/" . $inname . "_feature_table.txt";
    my $faa           = $indir . "/" . $inname . "_protein.faa";
    my $fna           = $indir . "/" . $inname . "_genomic.fna";

    my %genePositionHash;      # gene name -> "start|end|strand"
    my %genomeSequenceHash;    # replicon name -> nucleotide sequence
    my %geneAnnotationHash;    # gene name -> annotation text
    my %gene2genomeHash;       # gene name -> replicon name
    my %genefaaHash;           # gene name -> protein sequence
    my $count = 0;

    # Output name: the matching --outprefix entry, or the part of the input
    # prefix in front of its first "."
    my $outname;
    if ($outprefix eq "") {
        my @tmp = split(/\./, $inname);
        $outname = $tmp[0];
    }
    else {
        $outname = $outList[$idx];
    }

    # check files: report the first missing one and skip this strain
    foreach my $required ($feature_table, $faa, $fna) {
        next if -e $required;
        print "$required was not found!\n";
        print "$inname was skipped!\n";
        next STRAIN;
    }

    # The readers are called with & so that the calls do not depend on how
    # the subs are declared further down in the file.

    # read feature
    &read_feature($feature_table, \%geneAnnotationHash, \%genePositionHash, \%gene2genomeHash);

    # get genome sequence
    &read_genomeSequence($fna, \%genomeSequenceHash);

    # get faa
    &read_faa($faa, \%genefaaHash);

    # extract nuc and output
    my $pep_file = "$outdir/$outname.pep";
    my $nuc_file = "$outdir/$outname.nuc";
    my $fun_file = "$outdir/$outname.function";

    open(my $pep_fh, '>', $pep_file) or die "Cannot write $pep_file: $!\n";
    open(my $nuc_fh, '>', $nuc_file) or die "Cannot write $nuc_file: $!\n";
    open(my $fun_fh, '>', $fun_file) or die "Cannot write $fun_file: $!\n";

    # Sorted so that repeated runs produce identical files; plain hash order
    # changes from run to run.
    foreach my $mygene (sort keys %genePositionHash) {
        my $complete = exists($geneAnnotationHash{$mygene})
            && exists($gene2genomeHash{$mygene})
            && exists($genefaaHash{$mygene})
            && exists($genomeSequenceHash{ $gene2genomeHash{$mygene} });

        if (!$complete) {
            print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
            next;
        }

        my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{ $gene2genomeHash{$mygene} });

        # getSeq returns "" when the gene location does not fit the sequence
        if ($nuc eq "") {
            print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
            next;
        }

        print {$pep_fh} ">$mygene\n$genefaaHash{$mygene}\n";
        print {$nuc_fh} ">$mygene\n$nuc\n";
        print {$fun_fh} "$mygene\t-\t$geneAnnotationHash{$mygene}\n";
        $count++;
    }

    # Buffered write errors (for example a full disk) only show up at close
    close($pep_fh) or die "Error while writing $pep_file: $!\n";
    close($nuc_fh) or die "Error while writing $nuc_file: $!\n";
    close($fun_fh) or die "Error while writing $fun_file: $!\n";

    print $inname . " -> $outname: $count genes were extracted in total.\n";
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
143
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
144
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# Extract the nucleotide sequence of one gene from its replicon sequence.
#
# Arguments:
#   $pos       - gene location encoded as "start|end|strand"; start and end
#                are 1-based inclusive coordinates, strand is "+" or "-"
#   $genomeSeq - complete sequence the coordinates refer to
#
# Returns the gene sequence (passed through rcseq() for the "-" strand), or
# "" when the location is malformed or does not fit inside $genomeSeq, so
# that the caller can skip the gene.
sub getSeq {
    my ($pos, $genomeSeq) = @_;
    return "" unless defined $pos and defined $genomeSeq;

    my ($start, $end, $strand) = split(/\|/, $pos);

    # Only plain positive integer coordinates are usable
    return "" unless defined $start and defined $end;
    return "" unless $start =~ /\A\d+\z/ and $end =~ /\A\d+\z/;

    # A start below 1 or an end in front of the start would make substr()
    # count from the end of the sequence and silently return the wrong region.
    return "" if $start < 1 or $end < $start or $end > length($genomeSeq);

    my $seq = substr($genomeSeq, $start - 1, $end - $start + 1);
    if (defined $strand and $strand eq "-") {
        # called with & so the call does not depend on how rcseq is declared
        $seq = &rcseq($seq);
    }
    return $seq;
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
157
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
158
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
159
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# Return the reverse complement of a nucleotide sequence.
#
# The sequence is upper-cased first. A, C, G and T are complemented, and so
# are the IUPAC ambiguity codes R/Y, K/M, B/V and D/H; every other character
# (N, S, W, gaps, ...) is kept as it is.
#
# Arguments:
#   $seq - nucleotide sequence
#
# Returns the reverse-complemented sequence as a single string in scalar and
# in list context ("" for an undefined argument).
sub rcseq {
    my ($seq) = @_;
    return "" unless defined $seq;

    # Work on the lexical copy; the global $_ of the caller is left alone.
    $seq = uc($seq);
    $seq =~ tr/ACGTRYKMBDHV/TGCAYRMKVHDB/;

    # reverse() only reverses the characters of a string in scalar context;
    # in list context it would hand back the string unchanged.
    return scalar reverse($seq);
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
169
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
170
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
171
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# Read a protein FASTA file into a hash.
#
# The key of each record is the first whitespace-delimited word of its
# header line, without the leading ">" and cut at the first "." (so
# ">WP_000001.1 some protein" is stored as "WP_000001"). Sequence lines of a
# record are concatenated. Records whose key is empty are not stored, and a
# later record with the same key replaces an earlier one.
#
# Arguments:
#   $infile  - path of the FASTA file
#   $seqHash - hash reference that receives key => sequence
#
# Returns nothing. If the file cannot be opened a warning is issued and
# $seqHash is left untouched.
sub read_faa {
    my ($infile, $seqHash) = @_;

    my $fh;
    if (!open($fh, '<', $infile)) {
        warn "Cannot open $infile: $!\n";
        return;
    }

    my $name = "";
    my $seq  = "";
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;    # also copes with CRLF line ends

        if ($line =~ /^>([^\s.]*)/) {
            my $next_name = $1;

            # a new header closes the record collected so far
            $$seqHash{$name} = $seq if $name ne "";
            $name = $next_name;
            $seq  = "";
        }
        else {
            $seq .= $line;
        }
    }
    close($fh);

    # store the last record of the file
    $$seqHash{$name} = $seq if $name ne "";
    return;
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
198
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
199
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
200
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
201
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
202
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
203
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# Read a nucleotide FASTA file into a hash.
#
# The key of each record is the first whitespace-delimited word of its
# header line without the leading ">"; unlike read_faa() a ".version"
# suffix is kept (">NC_003198.1 Salmonella ..." is stored as "NC_003198.1").
# Sequence lines of a record are concatenated. Records whose key is empty
# are not stored, and a later record with the same key replaces an earlier
# one.
#
# Arguments:
#   $infile  - path of the FASTA file
#   $seqHash - hash reference that receives key => sequence
#
# Returns nothing. If the file cannot be opened a warning is issued and
# $seqHash is left untouched.
sub read_genomeSequence {
    my ($infile, $seqHash) = @_;

    my $fh;
    if (!open($fh, '<', $infile)) {
        warn "Cannot open $infile: $!\n";
        return;
    }

    my $name = "";
    my $seq  = "";
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;    # also copes with CRLF line ends

        if ($line =~ /^>(\S*)/) {
            my $next_name = $1;

            # a new header closes the record collected so far
            $$seqHash{$name} = $seq if $name ne "";
            $name = $next_name;
            $seq  = "";
        }
        else {
            $seq .= $line;
        }
    }
    close($fh);

    # store the last record of the file
    $$seqHash{$name} = $seq if $name ne "";
    return;
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
228
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
229
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
230
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# Read a tab-separated feature table and collect the protein-coding genes.
#
# The first line of the file is never treated as a gene row (presumably it
# is the column header). Every other line is looked at together with the
# line that follows it. A pair is accepted when
#   - column 1 of the first line is "protein_coding",
#   - columns 7, 8 and 9 (used by the caller as start, end and strand) are
#     identical in both lines, and
#   - column 10 of the second line is not empty.
# The gene name is column 10 of the second line cut at its first ".".
# Column indices are 0-based.
#
# Arguments:
#   $infile          - path of the feature table
#   $annHash         - hash ref, receives gene name => column 13 of the
#                      second line ("" if that column is missing)
#   $posHash         - hash ref, receives gene name => "start|end|strand"
#   $gene2genomeHash - hash ref, receives gene name => column 6 of the
#                      first line (the caller uses it as the key of the
#                      genome sequence)
#
# Returns nothing. If the file cannot be opened a warning is issued and the
# hashes are left untouched.
sub read_feature {
    my ($infile, $annHash, $posHash, $gene2genomeHash) = @_;

    my $fh;
    if (!open($fh, '<', $infile)) {
        warn "Cannot open $infile: $!\n";
        return;
    }
    my @content = <$fh>;
    close($fh);
    s/\r?\n\z// for @content;    # also copes with CRLF line ends

    # stop one line early: every row is inspected together with its successor
    for my $line (1 .. $#content - 1) {
        my @row     = split(/\t/, $content[$line]);
        my @nextrow = split(/\t/, $content[$line + 1]);

        next unless defined $row[1] and $row[1] eq "protein_coding";

        # short or blank lines cannot describe a usable gene
        next if @row < 10 or @nextrow < 11;

        # in case the following line belongs to a different location
        my $location = join("|", @row[7 .. 9]);
        next if $location ne join("|", @nextrow[7 .. 9]);

        # without an accession there is no name to store the gene under
        my ($geneName) = split(/\./, $nextrow[10]);
        next unless defined $geneName and $geneName ne "";

        $$annHash{$geneName}         = defined $nextrow[13] ? $nextrow[13] : "";
        $$posHash{$geneName}         = $location;
        $$gene2genomeHash{$geneName} = $row[6];
    }
    return;
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
263
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
264
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
# Print the usage text followed by an optional message and end the script.
#
# Arguments:
#   $message - text printed after the usage; "" (or undef) for a plain help
#              request
#
# With an empty message the text goes to STDOUT and the exit status is 0.
# With a non-empty message the call reports a problem: the text goes to
# STDERR and the exit status is 1, so that calling programs can detect the
# failure. This sub never returns.
sub print_usage {
    my $message = shift;
    $message = "" unless defined $message;

    my $usage = <<'END_USAGE';

Version: 2016042201
Usage: perl Converter_NCBINewFormatData.pl [options]


Options:

  --inprefix   String  The prefix of the input files, such as GCF_000007545.1_ASM754v1
                       If two or more strains were provided, please join their prefixs with "+"
                       Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1

  --indir      String  The directory of those input files

  --outprefix  String  The prefix for the output files.
                       If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function
                       If two or more strains were provided, please join their prefixs with "+"
                       If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545

  --outdir     String  The directory for the output files

Note:
  1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain.

  2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data.


END_USAGE

    my $is_error = ($message ne "") ? 1 : 0;
    my $out      = $is_error ? \*STDERR : \*STDOUT;

    print {$out} $usage . "\n";
    print {$out} $message . "\n";
    exit($is_error);
}
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
299
83e62a1aeeeb Uploaded
dereeper
parents:
diff changeset
300