0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
|
5
|
|
6
|
|
7
|
|
8 my %optionHash=qw();
|
|
9 my $inprefix="";
|
|
10 my $outprefix="";
|
|
11 my $indir="";
|
|
12 my $outdir="";
|
|
13 my @inList;
|
|
14 my @outList;
|
|
15
|
|
16 GetOptions(\%optionHash,"inprefix:s","outprefix:s","indir:s","outdir:s","help|h!");
|
|
17
|
|
18
|
|
19 if(scalar(keys %optionHash)==0){
|
|
20 &print_usage("");
|
|
21 }
|
|
22
|
|
23 if(exists($optionHash{"h"}) or exists($optionHash{"help"}) ){
|
|
24 &print_usage("");
|
|
25 }
|
|
26
|
|
27 if( exists($optionHash{"inprefix"} ) ){
|
|
28 $inprefix = $optionHash{"inprefix"};
|
|
29 @inList=split(/\+/,$inprefix);
|
|
30 }else{
|
|
31 &print_usage("--inprefix should be provided!");
|
|
32 }
|
|
33
|
|
34 if( exists($optionHash{"outprefix"} ) ){
|
|
35 $outprefix = $optionHash{"outprefix"};
|
|
36 @outList=split(/\+/,$outprefix);
|
|
37 }
|
|
38
|
|
39 if( exists($optionHash{"indir"} ) ){
|
|
40 $indir = $optionHash{"indir"};
|
|
41 }else{
|
|
42 &print_usage("--indir should be provided!");
|
|
43 }
|
|
44
|
|
45 if( exists($optionHash{"outdir"} ) ){
|
|
46 $outdir = $optionHash{"outdir"};
|
|
47 }else{
|
|
48 &print_usage("--outdir should be provided!");
|
|
49 }
|
|
50
|
|
51
|
|
52 if( $inprefix eq "" or $indir eq "" or $outdir eq "" ){
|
|
53 &print_usage("");
|
|
54 }
|
|
55
|
|
56 if( scalar(@outList) >0 and scalar(@outList) != scalar(@inList) ){
|
|
57 &print_usage("If outprefix was provided, the name number should be identical with inprefix");
|
|
58 }
|
|
59
|
|
60
|
|
61 system("mkdir -p $outdir");
|
|
62
|
|
63 foreach my $idx (0 .. (scalar(@inList) -1) ){
|
|
64 my $inname = $inList[$idx];
|
|
65 my $feature_table = $indir."/".$inname."_feature_table.txt";
|
|
66 my $faa = $indir."/".$inname."_protein.faa";
|
|
67 my $fna = $indir."/".$inname."_genomic.fna";
|
|
68
|
|
69 my %genePositionHash;
|
|
70 my %genomeSequenceHash;
|
|
71 my %geneAnnotationHash;
|
|
72 my %gene2genomeHash;
|
|
73 my %genefaaHash;
|
|
74 my $count=0;
|
|
75
|
|
76 my $outname;
|
|
77
|
|
78 if($outprefix eq ""){
|
|
79 my @tmp=split(/\./,$inname);
|
|
80 $outname = $tmp[0];
|
|
81 }else{
|
|
82 $outname = $outList[$idx];
|
|
83 }
|
|
84
|
|
85 # check file
|
|
86 if( ! -e $feature_table){
|
|
87 print "$feature_table was not found!\n";
|
|
88 print "$inname was skipped!\n";
|
|
89 next;
|
|
90 }
|
|
91
|
|
92 if( ! -e $faa){
|
|
93 print "$faa was not found!\n";
|
|
94 print "$inname was skipped!\n";
|
|
95 next;
|
|
96 }
|
|
97
|
|
98 if( ! -e $fna){
|
|
99 print "$fna was not found!\n";
|
|
100 print "$inname was skipped!\n";
|
|
101 next;
|
|
102 }
|
|
103
|
|
104 # read feature
|
|
105 &read_feature($feature_table,\%geneAnnotationHash,\%genePositionHash,\%gene2genomeHash);
|
|
106
|
|
107 # get genome sequence
|
|
108 &read_genomeSequence($fna,\%genomeSequenceHash);
|
|
109
|
|
110 # get faa
|
|
111 &read_faa($faa,\%genefaaHash);
|
|
112
|
|
113 # extract nuc and output
|
|
114
|
|
115 open(PEP,">$outdir/$outname.pep");
|
|
116 open(NUC,">$outdir/$outname.nuc");
|
|
117 open(FUN,">$outdir/$outname.function");
|
|
118
|
|
119 foreach my $mygene (keys %genePositionHash){
|
|
120 if( (!exists( $geneAnnotationHash{$mygene})) or (!exists($gene2genomeHash{$mygene})) or (!exists($genefaaHash{$mygene})) or (!exists($genomeSequenceHash{$gene2genomeHash{$mygene}})) ){
|
|
121 print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
|
|
122 next;
|
|
123 }
|
|
124
|
|
125 my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{$gene2genomeHash{$mygene}} );
|
|
126
|
|
127 if( $nuc eq ""){
|
|
128 print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
|
|
129 next;
|
|
130 }
|
|
131
|
|
132 print PEP ">$mygene\n$genefaaHash{$mygene}\n";
|
|
133 print NUC ">$mygene\n$nuc\n";
|
|
134 print FUN "$mygene\t-\t$geneAnnotationHash{$mygene}\n";
|
|
135 $count++;
|
|
136 }
|
|
137 close(PEP);
|
|
138 close(NUC);
|
|
139 close(FUN);
|
|
140
|
|
141 print $inname . " -> $outname: $count genes were extracted in total.\n";
|
|
142 }
|
|
143
|
|
144
|
|
145 sub getSeq(){
|
|
146 (my $pos,my $genomeSeq)=@_;
|
|
147 my @tmp=split(/\|/,$pos);
|
|
148 if( $tmp[1]> length($genomeSeq) ){
|
|
149 return "";
|
|
150 }
|
|
151 my $seq = substr($genomeSeq,$tmp[0]-1,$tmp[1]-$tmp[0]+1);
|
|
152 if($tmp[2] eq "-"){
|
|
153 $seq = &rcseq($seq);
|
|
154 }
|
|
155 return $seq;
|
|
156 }
|
|
157
|
|
158
|
|
159
|
|
160 sub rcseq(){
|
|
161 (my $seq)=@_;
|
|
162 $seq=uc($seq);
|
|
163
|
|
164 $_=$seq;
|
|
165 tr/ACGT/TGCA/;
|
|
166 $seq = $_;
|
|
167 return reverse($_);
|
|
168 }
|
|
169
|
|
170
|
|
171
|
|
172 sub read_faa(){
|
|
173 (my $infile, my $seqHash)=@_;
|
|
174 my $seq="";
|
|
175 my $name="";
|
|
176 my @content;
|
|
177 open(F,$infile);
|
|
178 @content=<F>;
|
|
179 close(F);
|
|
180 chomp(@content);
|
|
181
|
|
182 for (my $line = 0; $line < @content; $line++) {
|
|
183 if($content[$line] =~/^>/ ){
|
|
184 if($name ne ""){
|
|
185 $$seqHash{$name}=$seq;
|
|
186 }
|
|
187 my @tmp=split(/\s+/,$content[$line]);
|
|
188 @tmp=split(/\./,$tmp[0]);
|
|
189
|
|
190 $name=substr($tmp[0], 1, length($tmp[0])-1);
|
|
191 $seq="";
|
|
192 }else{
|
|
193 $seq = $seq . $content[$line];
|
|
194 }
|
|
195 }
|
|
196 $$seqHash{$name}=$seq;
|
|
197 }
|
|
198
|
|
199
|
|
200
|
|
201
|
|
202
|
|
203
|
|
204 sub read_genomeSequence(){
|
|
205 (my $infile,my $seqHash)=@_;
|
|
206 my $seq="";
|
|
207 my $name="";
|
|
208 my @content;
|
|
209 open(F,$infile);
|
|
210 @content=<F>;
|
|
211 close(F);
|
|
212 chomp(@content);
|
|
213
|
|
214 for (my $line = 0; $line < @content; $line++) {
|
|
215 if($content[$line] =~/^>/ ){
|
|
216 if($name ne ""){
|
|
217 $$seqHash{$name}=$seq;
|
|
218 }
|
|
219 my @tmp=split(/\s+/,$content[$line]);
|
|
220 $name=substr($tmp[0], 1, length($tmp[0])-1);
|
|
221 $seq="";
|
|
222 }else{
|
|
223 $seq = $seq . $content[$line];
|
|
224 }
|
|
225 }
|
|
226 $$seqHash{$name}=$seq;
|
|
227 }
|
|
228
|
|
229
|
|
230
|
|
231 sub read_feature(){
|
|
232 (my $infile, my $annHash,my $posHash,my $gene2genomeHash)=@_;
|
|
233 my @content;
|
|
234 open(F,$infile);
|
|
235 @content=<F>;
|
|
236 close(F);
|
|
237 chomp(@content);
|
|
238
|
|
239 for (my $line = 1; $line < @content - 1; $line=$line+1) {
|
|
240 my @row=split(/\t/,$content[$line]);
|
|
241 my @nextrow=split(/\t/,$content[$line+1]);
|
|
242
|
|
243 if($row[1] ne "protein_coding"){
|
|
244 next;
|
|
245 }
|
|
246
|
|
247 # in case
|
|
248 if( $row[7]."|".$row[8]."|".$row[9] ne $nextrow[7]."|".$nextrow[8]."|".$nextrow[9]){
|
|
249 next;
|
|
250 }
|
|
251
|
|
252 # print $nextrow[10]."\n";
|
|
253 # print $row[7]."|".$row[8]."|".$row[9]."\n";
|
|
254
|
|
255 my @tmp=split(/\./,$nextrow[10]);
|
|
256 my $geneName = $tmp[0];
|
|
257
|
|
258 $$annHash{$geneName}=$nextrow[13];
|
|
259 $$posHash{$geneName}=$row[7]."|".$row[8]."|".$row[9];
|
|
260 $$gene2genomeHash{$geneName}=$row[6];
|
|
261 }
|
|
262 }
|
|
263
|
|
264
|
|
265 sub print_usage(){
|
|
266 my $message=shift;
|
|
267 my @usage=qq(
|
|
268 Version: 2016042201
|
|
269 Usage: perl Converter_NCBINewFormatData.pl [options]
|
|
270
|
|
271
|
|
272 Options:
|
|
273
|
|
274 --inprefix String The prefix of the input files, such as GCF_000007545.1_ASM754v1
|
|
275 If two or more strains were provided, please join their prefixs with "+"
|
|
276 Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1
|
|
277
|
|
278 --indir String The directory of those input files
|
|
279
|
|
280 --outprefix String The prefix for the output files.
|
|
281 If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function
|
|
282 If two or more strains were provided, please join their prefixs with "+"
|
|
283 If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545
|
|
284
|
|
285 --outdir String The directory for the output files
|
|
286
|
|
287 Note:
|
|
288 1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain.
|
|
289
|
|
290 2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data.
|
|
291
|
|
292
|
|
293 );
|
|
294
|
|
295 print join("\n",@usage)."\n";
|
|
296 print $message."\n";
|
|
297 exit;
|
|
298 }
|
|
299
|
|
300
|