Mercurial > repos > dereeper > pgap
comparison PGAP-1.2.1/Converter_NCBINewFormatData.pl @ 0:83e62a1aeeeb draft
Uploaded
author | dereeper |
---|---|
date | Thu, 24 Jun 2021 13:51:52 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:83e62a1aeeeb |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long; | |
5 | |
6 | |
7 | |
8 my %optionHash=qw(); | |
9 my $inprefix=""; | |
10 my $outprefix=""; | |
11 my $indir=""; | |
12 my $outdir=""; | |
13 my @inList; | |
14 my @outList; | |
15 | |
16 GetOptions(\%optionHash,"inprefix:s","outprefix:s","indir:s","outdir:s","help|h!"); | |
17 | |
18 | |
19 if(scalar(keys %optionHash)==0){ | |
20 &print_usage(""); | |
21 } | |
22 | |
23 if(exists($optionHash{"h"}) or exists($optionHash{"help"}) ){ | |
24 &print_usage(""); | |
25 } | |
26 | |
27 if( exists($optionHash{"inprefix"} ) ){ | |
28 $inprefix = $optionHash{"inprefix"}; | |
29 @inList=split(/\+/,$inprefix); | |
30 }else{ | |
31 &print_usage("--inprefix should be provided!"); | |
32 } | |
33 | |
34 if( exists($optionHash{"outprefix"} ) ){ | |
35 $outprefix = $optionHash{"outprefix"}; | |
36 @outList=split(/\+/,$outprefix); | |
37 } | |
38 | |
39 if( exists($optionHash{"indir"} ) ){ | |
40 $indir = $optionHash{"indir"}; | |
41 }else{ | |
42 &print_usage("--indir should be provided!"); | |
43 } | |
44 | |
45 if( exists($optionHash{"outdir"} ) ){ | |
46 $outdir = $optionHash{"outdir"}; | |
47 }else{ | |
48 &print_usage("--outdir should be provided!"); | |
49 } | |
50 | |
51 | |
52 if( $inprefix eq "" or $indir eq "" or $outdir eq "" ){ | |
53 &print_usage(""); | |
54 } | |
55 | |
56 if( scalar(@outList) >0 and scalar(@outList) != scalar(@inList) ){ | |
57 &print_usage("If outprefix was provided, the name number should be identical with inprefix"); | |
58 } | |
59 | |
60 | |
61 system("mkdir -p $outdir"); | |
62 | |
63 foreach my $idx (0 .. (scalar(@inList) -1) ){ | |
64 my $inname = $inList[$idx]; | |
65 my $feature_table = $indir."/".$inname."_feature_table.txt"; | |
66 my $faa = $indir."/".$inname."_protein.faa"; | |
67 my $fna = $indir."/".$inname."_genomic.fna"; | |
68 | |
69 my %genePositionHash; | |
70 my %genomeSequenceHash; | |
71 my %geneAnnotationHash; | |
72 my %gene2genomeHash; | |
73 my %genefaaHash; | |
74 my $count=0; | |
75 | |
76 my $outname; | |
77 | |
78 if($outprefix eq ""){ | |
79 my @tmp=split(/\./,$inname); | |
80 $outname = $tmp[0]; | |
81 }else{ | |
82 $outname = $outList[$idx]; | |
83 } | |
84 | |
85 # check file | |
86 if( ! -e $feature_table){ | |
87 print "$feature_table was not found!\n"; | |
88 print "$inname was skipped!\n"; | |
89 next; | |
90 } | |
91 | |
92 if( ! -e $faa){ | |
93 print "$faa was not found!\n"; | |
94 print "$inname was skipped!\n"; | |
95 next; | |
96 } | |
97 | |
98 if( ! -e $fna){ | |
99 print "$fna was not found!\n"; | |
100 print "$inname was skipped!\n"; | |
101 next; | |
102 } | |
103 | |
104 # read feature | |
105 &read_feature($feature_table,\%geneAnnotationHash,\%genePositionHash,\%gene2genomeHash); | |
106 | |
107 # get genome sequence | |
108 &read_genomeSequence($fna,\%genomeSequenceHash); | |
109 | |
110 # get faa | |
111 &read_faa($faa,\%genefaaHash); | |
112 | |
113 # extract nuc and output | |
114 | |
115 open(PEP,">$outdir/$outname.pep"); | |
116 open(NUC,">$outdir/$outname.nuc"); | |
117 open(FUN,">$outdir/$outname.function"); | |
118 | |
119 foreach my $mygene (keys %genePositionHash){ | |
120 if( (!exists( $geneAnnotationHash{$mygene})) or (!exists($gene2genomeHash{$mygene})) or (!exists($genefaaHash{$mygene})) or (!exists($genomeSequenceHash{$gene2genomeHash{$mygene}})) ){ | |
121 print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; | |
122 next; | |
123 } | |
124 | |
125 my $nuc = &getSeq($genePositionHash{$mygene}, $genomeSequenceHash{$gene2genomeHash{$mygene}} ); | |
126 | |
127 if( $nuc eq ""){ | |
128 print $inname . "\t" . $mygene . "\t" . "skipped for insufficient data!\n"; | |
129 next; | |
130 } | |
131 | |
132 print PEP ">$mygene\n$genefaaHash{$mygene}\n"; | |
133 print NUC ">$mygene\n$nuc\n"; | |
134 print FUN "$mygene\t-\t$geneAnnotationHash{$mygene}\n"; | |
135 $count++; | |
136 } | |
137 close(PEP); | |
138 close(NUC); | |
139 close(FUN); | |
140 | |
141 print $inname . " -> $outname: $count genes were extracted in total.\n"; | |
142 } | |
143 | |
144 | |
145 sub getSeq(){ | |
146 (my $pos,my $genomeSeq)=@_; | |
147 my @tmp=split(/\|/,$pos); | |
148 if( $tmp[1]> length($genomeSeq) ){ | |
149 return ""; | |
150 } | |
151 my $seq = substr($genomeSeq,$tmp[0]-1,$tmp[1]-$tmp[0]+1); | |
152 if($tmp[2] eq "-"){ | |
153 $seq = &rcseq($seq); | |
154 } | |
155 return $seq; | |
156 } | |
157 | |
158 | |
159 | |
160 sub rcseq(){ | |
161 (my $seq)=@_; | |
162 $seq=uc($seq); | |
163 | |
164 $_=$seq; | |
165 tr/ACGT/TGCA/; | |
166 $seq = $_; | |
167 return reverse($_); | |
168 } | |
169 | |
170 | |
171 | |
172 sub read_faa(){ | |
173 (my $infile, my $seqHash)=@_; | |
174 my $seq=""; | |
175 my $name=""; | |
176 my @content; | |
177 open(F,$infile); | |
178 @content=<F>; | |
179 close(F); | |
180 chomp(@content); | |
181 | |
182 for (my $line = 0; $line < @content; $line++) { | |
183 if($content[$line] =~/^>/ ){ | |
184 if($name ne ""){ | |
185 $$seqHash{$name}=$seq; | |
186 } | |
187 my @tmp=split(/\s+/,$content[$line]); | |
188 @tmp=split(/\./,$tmp[0]); | |
189 | |
190 $name=substr($tmp[0], 1, length($tmp[0])-1); | |
191 $seq=""; | |
192 }else{ | |
193 $seq = $seq . $content[$line]; | |
194 } | |
195 } | |
196 $$seqHash{$name}=$seq; | |
197 } | |
198 | |
199 | |
200 | |
201 | |
202 | |
203 | |
204 sub read_genomeSequence(){ | |
205 (my $infile,my $seqHash)=@_; | |
206 my $seq=""; | |
207 my $name=""; | |
208 my @content; | |
209 open(F,$infile); | |
210 @content=<F>; | |
211 close(F); | |
212 chomp(@content); | |
213 | |
214 for (my $line = 0; $line < @content; $line++) { | |
215 if($content[$line] =~/^>/ ){ | |
216 if($name ne ""){ | |
217 $$seqHash{$name}=$seq; | |
218 } | |
219 my @tmp=split(/\s+/,$content[$line]); | |
220 $name=substr($tmp[0], 1, length($tmp[0])-1); | |
221 $seq=""; | |
222 }else{ | |
223 $seq = $seq . $content[$line]; | |
224 } | |
225 } | |
226 $$seqHash{$name}=$seq; | |
227 } | |
228 | |
229 | |
230 | |
231 sub read_feature(){ | |
232 (my $infile, my $annHash,my $posHash,my $gene2genomeHash)=@_; | |
233 my @content; | |
234 open(F,$infile); | |
235 @content=<F>; | |
236 close(F); | |
237 chomp(@content); | |
238 | |
239 for (my $line = 1; $line < @content - 1; $line=$line+1) { | |
240 my @row=split(/\t/,$content[$line]); | |
241 my @nextrow=split(/\t/,$content[$line+1]); | |
242 | |
243 if($row[1] ne "protein_coding"){ | |
244 next; | |
245 } | |
246 | |
247 # in case | |
248 if( $row[7]."|".$row[8]."|".$row[9] ne $nextrow[7]."|".$nextrow[8]."|".$nextrow[9]){ | |
249 next; | |
250 } | |
251 | |
252 # print $nextrow[10]."\n"; | |
253 # print $row[7]."|".$row[8]."|".$row[9]."\n"; | |
254 | |
255 my @tmp=split(/\./,$nextrow[10]); | |
256 my $geneName = $tmp[0]; | |
257 | |
258 $$annHash{$geneName}=$nextrow[13]; | |
259 $$posHash{$geneName}=$row[7]."|".$row[8]."|".$row[9]; | |
260 $$gene2genomeHash{$geneName}=$row[6]; | |
261 } | |
262 } | |
263 | |
264 | |
265 sub print_usage(){ | |
266 my $message=shift; | |
267 my @usage=qq( | |
268 Version: 2016042201 | |
269 Usage: perl Converter_NCBINewFormatData.pl [options] | |
270 | |
271 | |
272 Options: | |
273 | |
274 --inprefix String The prefix of the input files, such as GCF_000007545.1_ASM754v1 | |
275 If two or more strains were provided, please join their prefixs with "+" | |
276 Such as GCF_000007545.1_ASM754v1+GCF_000008105.1_ASM810v1+GCF_000711315.1_ASM71131v1 | |
277 | |
278 --indir String The directory of those input files | |
279 | |
280 --outprefix String The prefix for the output files. | |
281 If a value "ty2" was provided, the output files would be: ty2.nuc, ty2.pep, and ty2.function | |
282 If two or more strains were provided, please join their prefixs with "+" | |
283 If the prefix was not provided, the assembly value would be used as the prefix, such as GCF_000007545 | |
284 | |
285 --outdir String The directory for the output files | |
286 | |
287 Note: | |
288 1. Before converting data with this script, please prepare *feature_table.txt, *genomic.fna and *protein.faa files for each strain. | |
289 | |
290 2. This script was designed for NCBI new format data only. If part of your data is in the old format, please use the Converter_finished.pl or Converter_draft.pl script to convert the data. | |
291 | |
292 | |
293 ); | |
294 | |
295 print join("\n",@usage)."\n"; | |
296 print $message."\n"; | |
297 exit; | |
298 } | |
299 | |
300 |