comparison PGAP-1.2.1/Converter_NCBINewFormatData.pl @ 0:83e62a1aeeeb draft

Uploaded
author dereeper
date Thu, 24 Jun 2021 13:51:52 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:83e62a1aeeeb
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5
6
7
8 my %optionHash=qw();
9 my $inprefix="";
10 my $outprefix="";
11 my $indir="";
12 my $outdir="";
13 my @inList;
14 my @outList;
15
16 GetOptions(\%optionHash,"inprefix:s","outprefix:s","indir:s","outdir:s","help|h!");
17
18
19 if(scalar(keys %optionHash)==0){
20 &print_usage("");
21 }
22
23 if(exists($optionHash{"h"}) or exists($optionHash{"help"}) ){
24 &print_usage("");
25 }
26
27 if( exists($optionHash{"inprefix"} ) ){
28 $inprefix = $optionHash{"inprefix"};
29 @inList=split(/\+/,$inprefix);
30 }else{
31 &print_usage("--inprefix should be provided!");
32 }
33
34 if( exists($optionHash{"outprefix"} ) ){
35 $outprefix = $optionHash{"outprefix"};
36 @outList=split(/\+/,$outprefix);
37 }
38
39 if( exists($optionHash{"indir"} ) ){
40 $indir = $optionHash{"indir"};
41 }else{
42 &print_usage("--indir should be provided!");
43 }
44
45 if( exists($optionHash{"outdir"} ) ){
46 $outdir = $optionHash{"outdir"};
47 }else{
48 &print_usage("--outdir should be provided!");
49 }
50
51
52 if( $inprefix eq "" or $indir eq "" or $outdir eq "" ){
53 &print_usage("");
54 }
55
56 if( scalar(@outList) >0 and scalar(@outList) != scalar(@inList) ){
57 &print_usage("If outprefix was provided, the name number should be identical with inprefix");
58 }
59
60
61 system("mkdir -p $outdir");
62
63 foreach my $idx (0 .. (scalar(@inList) -1) ){
64 my $inname = $inList[$idx];
65 my $feature_table = $indir."/".$inname."_feature_table.txt";
66 my $faa = $indir."/".$inname."_protein.faa";
67 my $fna = $indir."/".$inname."_genomic.fna";
68
69 my %genePositionHash;
70 my %genomeSequenceHash;
71 my %geneAnnotationHash;
72 my %gene2genomeHash;
73 my %genefaaHash;
74 my $count=0;
75
76 my $outname;
77
78 if($outprefix eq ""){
79 my @tmp=split(/\./,$inname);
80 $outname = $tmp[0];
81 }else{
82 $outname = $outList[$idx];
83 }
84
85 # check file
86 if( ! -e $feature_table){
87 print "$feature_table was not found!\n";
88 print "$inname was skipped!\n";
89 next;
90 }
91
92 if( ! -e $faa){
93 print "$faa was not found!\n";
94 print "$inname was skipped!\n";
95 next;
96 }
97
98 if( ! -e $fna){
99 print "$fna was not found!\n";
100 print "$inname was skipped!\n";
101 next;
102 }
103
104 # read feature
105 &read_feature($feature_table,\%geneAnnotationHash,\%genePositionHash,\%gene2genomeHash);
106
107 # get genome sequence
108 &read_genomeSequence($fna,\%genomeSequenceHash);
109
110 # get faa
111 &read_faa($faa,\%genefaaHash);
112
113 # extract nuc and output
114
115 open(PEP,">$outdir/$outname.pep");
116 open(NUC,">$outdir/$outname.nuc");
117 open(FUN,">$outdir/$outname.function");
118
119 foreach my $mygene (keys %genePositionHash){
120 if( (!exists( $geneAnnotationHash{$mygene})) or (!exists($gene2genomeHash{$mygene})) or (!exists($genefaaHash{$mygene})) or (!exists($genomeSequenceHash{$gene2genomeHash{$mygene}})) ){
121 print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
122 next;
123 }
124
125 my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{$gene2genomeHash{$mygene}} );
126
127 if( $nuc eq ""){
128 print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n";
129 next;
130 }
131
132 print PEP ">$mygene\n$genefaaHash{$mygene}\n";
133 print NUC ">$mygene\n$nuc\n";
134 print FUN "$mygene\t-\t$geneAnnotationHash{$mygene}\n";
135 $count++;
136 }
137 close(PEP);
138 close(NUC);
139 close(FUN);
140
141 print $inname . " -> $outname: $count genes were extracted in total.\n";
142 }
143
144
145 sub getSeq(){
146 (my $pos,my $genomeSeq)=@_;
147 my @tmp=split(/\|/,$pos);
148 if( $tmp[1]> length($genomeSeq) ){
149 return "";
150 }
151 my $seq = substr($genomeSeq,$tmp[0]-1,$tmp[1]-$tmp[0]+1);
152 if($tmp[2] eq "-"){
153 $seq = &rcseq($seq);
154 }
155 return $seq;
156 }
157
158
159
160 sub rcseq(){
161 (my $seq)=@_;
162 $seq=uc($seq);
163
164 $_=$seq;
165 tr/ACGT/TGCA/;
166 $seq = $_;
167 return reverse($_);
168 }
169
170
171
172 sub read_faa(){
173 (my $infile, my $seqHash)=@_;
174 my $seq="";
175 my $name="";
176 my @content;
177 open(F,$infile);
178 @content=<F>;
179 close(F);
180 chomp(@content);
181
182 for (my $line = 0; $line < @content; $line++) {
183 if($content[$line] =~/^>/ ){
184 if($name ne ""){
185 $$seqHash{$name}=$seq;
186 }
187 my @tmp=split(/\s+/,$content[$line]);
188 @tmp=split(/\./,$tmp[0]);
189
190 $name=substr($tmp[0], 1, length($tmp[0])-1);
191 $seq="";
192 }else{
193 $seq = $seq . $content[$line];
194 }
195 }
196 $$seqHash{$name}=$seq;
197 }
198
199
200
201
202
203
204 sub read_genomeSequence(){
205 (my $infile,my $seqHash)=@_;
206 my $seq="";
207 my $name="";
208 my @content;
209 open(F,$infile);
210 @content=<F>;
211 close(F);
212 chomp(@content);
213
214 for (my $line = 0; $line < @content; $line++) {
215 if($content[$line] =~/^>/ ){
216 if($name ne ""){
217 $$seqHash{$name}=$seq;
218 }
219 my @tmp=split(/\s+/,$content[$line]);
220 $name=substr($tmp[0], 1, length($tmp[0])-1);
221 $seq="";
222 }else{
223 $seq = $seq . $content[$line];
224 }
225 }
226 $$seqHash{$name}=$seq;
227 }
228
229
230
231 sub read_feature(){
232 (my $infile, my $annHash,my $posHash,my $gene2genomeHash)=@_;
233 my @content;
234 open(F,$infile);
235 @content=<F>;
236 close(F);
237 chomp(@content);
238
239 for (my $line = 1; $line < @content - 1; $line=$line+1) {
240 my @row=split(/\t/,$content[$line]);
241 my @nextrow=split(/\t/,$content[$line+1]);
242
243 if($row[1] ne "protein_coding"){
244 next;
245 }
246
247 # in case
248 if( $row[7]."|".$row[8]."|".$row[9] ne $nextrow[7]."|".$nextrow[8]."|".$nextrow[9]){
249 next;
250 }
251
252 # print $nextrow[10]."\n";
253 # print $row[7]."|".$row[8]."|".$row[9]."\n";
254
255 my @tmp=split(/\./,$nextrow[10]);
256 my $geneName = $tmp[0];
257
258 $$annHash{$geneName}=$nextrow[13];
259 $$posHash{$geneName}=$row[7]."|".$row[8]."|".$row[9];
260 $$gene2genomeHash{$geneName}=$row[6];
261 }
262 }
263
264
265 sub print_usage(){
266 my $message=shift;
267 my @usage=qq(
268 Version: 2016042201
269 Usage: perl Converter_NCBINewFormatData.pl [options]
270
271
272 Options:
273
274 --inprefix String The prefix of the input files, such as GCF_000007545.1_ASM754v1
275 If two or more strains were provided, please join their prefixs with "+"
276 Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1
277
278 --indir String The directory of those input files
279
280 --outprefix String The prefix for the output files.
281 If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function
282 If two or more strains were provided, please join their prefixs with "+"
283 If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545
284
285 --outdir String The directory for the output files
286
287 Note:
288 1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain.
289
290 2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data.
291
292
293 );
294
295 print join("\n",@usage)."\n";
296 print $message."\n";
297 exit;
298 }
299
300