0
|
1 package Bio::Roary::PrepareInputFiles;
|
|
2
|
|
3 # ABSTRACT: Take in a mixture of FASTA and GFF input files and output FASTA proteomes only
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 Take in a mixture of FASTA and GFF input files and output FASTA proteomes only
|
|
8 use Bio::Roary::PrepareInputFiles;
|
|
9
|
|
10 my $obj = Bio::Roary::PrepareInputFiles->new(
|
|
11 input_files => ['abc.gff','ddd.faa'],
|
|
12 );
|
|
13 $obj->fasta_files;
|
|
14
|
|
15 =cut
|
|
16
|
|
17 use Moose;
|
|
18 use Bio::Roary::Exceptions;
|
|
19 use Bio::Roary::ExtractProteomeFromGFFs;
|
|
20 use Bio::Roary::FilterUnknownsFromFasta;
|
|
21 use Cwd qw(getcwd);
|
|
22 use File::Temp;
|
|
23 use Log::Log4perl qw(:easy);
|
|
24
|
|
# --- Public attributes -------------------------------------------------------

# Mixture of input file names; names ending in ".gff" are treated as GFF
# files, everything else as candidate protein FASTA files.
has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );

# The next two values are handed on unchanged to
# Bio::Roary::ExtractProteomeFromGFFs when GFF files are converted.
has 'job_runner' => ( is => 'ro', isa => 'Str', default => 'Local' );
has 'cpus'       => ( is => 'ro', isa => 'Int', default => 1 );

# --- Lazily built private attributes -----------------------------------------

# Subsets of input_files: GFF inputs and validated FASTA inputs.
has '_input_gff_files'   => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__input_gff_files' );
has '_input_fasta_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__input_fasta_files' );

# FASTA inputs after being passed through Bio::Roary::FilterUnknownsFromFasta,
# and the filter object that produced them.
has '_input_fasta_files_filtered' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__input_fasta_files_filtered' );
has '_input_fasta_files_filtered_obj' =>
  ( is => 'ro', isa => 'Bio::Roary::FilterUnknownsFromFasta', lazy => 1, builder => '_build__input_fasta_files_filtered_obj' );

# FASTA files derived from the GFF inputs, and the extractor object that
# produced them.
has '_derived_fasta_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__derived_fasta_files' );
has '_extract_proteome_obj' => (
    is      => 'ro',
    isa     => 'Bio::Roary::ExtractProteomeFromGFFs',
    lazy    => 1,
    builder => '_build__extract_proteome_obj'
);

# --- Options forwarded to Bio::Roary::ExtractProteomeFromGFFs ----------------

has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'translation_table'     => ( is => 'rw', isa => 'Int',  default => 11 );
has 'verbose'               => ( is => 'rw', isa => 'Bool', default => 0 );

# Not read anywhere in this file; kept so existing callers do not break.
# The type must name the real filter class, and the builder must be a method
# other than the accessor itself (a self-referencing builder recurses without
# end), so the existing filter-object builder is reused.
has '_fasta_filter_obj' =>
  ( is => 'ro', isa => 'Bio::Roary::FilterUnknownsFromFasta', lazy => 1, builder => '_build__input_fasta_files_filtered_obj' );

# Scratch directory created under the current working directory when the
# object is constructed; CLEANUP => 1 asks File::Temp to remove it once the
# directory object goes away.
has 'working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );

# Log::Log4perl logger, created on first use.
has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
|
|
47
|
|
# Builder for the 'logger' attribute.
# Puts Log::Log4perl into its "easy" mode at ERROR level and returns
# whatever get_logger() hands back for the current package.
sub _build_logger {
    my $self = shift;

    Log::Log4perl->easy_init($ERROR);

    return scalar get_logger();
}
|
|
54
|
|
# Builder for '_input_gff_files'.
# Returns a reference to a new array holding every entry of input_files
# whose name ends in ".gff", in their original order.
sub _build__input_gff_files {
    my $self = shift;

    return [ grep { /\.gff$/ } @{ $self->input_files } ];
}
|
|
60
|
|
# Builder for '_input_fasta_files'.
#
# Takes every entry of input_files that does not end in ".gff" and tries to
# read it from start to finish as a protein FASTA file with Bio::SeqIO.
# Files that cannot be read are reported through the logger and left out.
#
# Returns a reference to the array of files that were read without error,
# in their original order (possibly empty).
#
# NOTE(review): Bio::SeqIO is not loaded by this file directly; presumably
# one of the Bio::Roary modules used above loads it - confirm.
sub _build__input_fasta_files {
    my ($self) = @_;

    my @candidate_files = grep { !/\.gff$/ } @{ $self->input_files };
    my @validated_fasta_files;

    for my $fasta_file (@candidate_files) {

        # The trailing 1 makes the eval return true only when nothing died,
        # so success does not depend on the state of $@ afterwards.
        my $parsed_ok = eval {
            my $inseq = Bio::SeqIO->new(
                -file     => $fasta_file,
                -format   => 'fasta',
                -alphabet => 'protein'
            );
            while ( my $seq = $inseq->next_seq ) {

                # Touch the sequence to force the record to be read.
                $seq->seq;
            }
            1;
        };

        if ($parsed_ok) {
            push( @validated_fasta_files, $fasta_file );
        }
        else {
            $self->logger->warn(
                "Input file doesnt have a .gff extension and isnt a protein FASTA file so excluding it from further analysis: $fasta_file"
            );
        }
    }

    # Return only the files that passed validation; excluded files must not
    # reach the downstream filtering step.
    return \@validated_fasta_files;
}
|
|
93
|
|
# Builder for '_input_fasta_files_filtered_obj'.
# Wraps the validated FASTA inputs in a new
# Bio::Roary::FilterUnknownsFromFasta object and returns it.
sub _build__input_fasta_files_filtered_obj {
    my $self = shift;

    my $validated_fasta_files = $self->_input_fasta_files;

    return Bio::Roary::FilterUnknownsFromFasta->new( fasta_files => $validated_fasta_files );
}
|
|
98
|
|
# Builder for '_input_fasta_files_filtered'.
# When there is a list of FASTA inputs, returns the result of calling
# filtered_fasta_files() on the filter object; otherwise returns undef.
sub _build__input_fasta_files_filtered {
    my $self = shift;

    if ( defined $self->_input_fasta_files ) {
        return $self->_input_fasta_files_filtered_obj->filtered_fasta_files();
    }

    # An explicit undef is returned (not a bare return) to match the
    # Maybe[ArrayRef] attribute this builder feeds.
    return undef;
}
|
|
104
|
|
# Builder for '_extract_proteome_obj'.
# Creates the Bio::Roary::ExtractProteomeFromGFFs object for the GFF inputs,
# forwarding this object's job, filtering, translation and working-directory
# settings unchanged.
sub _build__extract_proteome_obj {
    my $self = shift;

    # Kept as an ordered list so the constructor sees the arguments in a
    # fixed order.
    my @constructor_args = (
        gff_files             => $self->_input_gff_files,
        job_runner            => $self->job_runner,
        apply_unknowns_filter => $self->apply_unknowns_filter,
        translation_table     => $self->translation_table,
        cpus                  => $self->cpus,
        verbose               => $self->verbose,
        working_directory     => $self->working_directory,
    );

    return Bio::Roary::ExtractProteomeFromGFFs->new(@constructor_args);
}
|
|
117
|
|
# Builder for '_derived_fasta_files'.
# When there is a list of GFF inputs, returns the result of calling
# fasta_files() on the proteome extractor; otherwise returns undef.
sub _build__derived_fasta_files {
    my $self = shift;

    if ( defined $self->_input_gff_files ) {
        return $self->_extract_proteome_obj->fasta_files();
    }

    # An explicit undef is returned (not a bare return) to match the
    # Maybe[ArrayRef] attribute this builder feeds.
    return undef;
}
|
|
123
|
|
# Returns a reference to a new array containing the filtered FASTA inputs
# followed by the FASTA files derived from the GFF inputs.
#
# Both underlying attributes are declared Maybe[ArrayRef] and their builders
# can return undef; an undefined list is treated as empty so the
# dereference cannot die under strict refs.
sub fasta_files {
    my ($self) = @_;

    my $filtered_inputs  = $self->_input_fasta_files_filtered || [];
    my $derived_from_gff = $self->_derived_fasta_files        || [];

    return [ @{$filtered_inputs}, @{$derived_from_gff} ];
}
|
|
129
|
|
# Maps each name in $input_files to the FASTA file recorded for it.
#
# For every input name, the extractor's fasta_files_to_gff_files table is
# consulted first; when it holds no defined entry, the value comes from the
# filter object's input_fasta_to_output_fasta table instead (which may itself
# be undef for an unknown name).
#
# Parameters:
#   $input_files - array reference of input file names
# Returns an array reference with one entry per input name, in input order.
sub lookup_fasta_files_from_unknown_input_files {
    my ( $self, $input_files ) = @_;

    # Result is discarded; presumably this call forces the lazily built
    # attributes to exist before the lookup tables are read - confirm.
    $self->fasta_files;

    my @output_fasta_files;
    for my $input_file ( @{$input_files} ) {
        my $from_gff = $self->_extract_proteome_obj->fasta_files_to_gff_files->{$input_file};

        push @output_fasta_files,
          defined $from_gff
          ? $from_gff
          : $self->_input_fasta_files_filtered_obj->input_fasta_to_output_fasta->{$input_file};
    }

    return \@output_fasta_files;
}
|
|
145
|
|
146 no Moose;
|
|
147 __PACKAGE__->meta->make_immutable;
|
|
148
|
|
149 1;
|