diff COG/bac-genomics-scripts/revcom_seq/revcom_seq.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COG/bac-genomics-scripts/revcom_seq/revcom_seq.pl	Thu May 30 11:52:25 2024 +0000
@@ -0,0 +1,207 @@
+#!/usr/bin/perl
+
+#######
+# POD #
+#######
+
+=pod
+
+=head1 NAME
+
+C<revcom_seq.pl> - reverse complement (multi-)sequence files
+
+=head1 SYNOPSIS
+
+C<perl revcom_seq.pl seq-file.embl E<gt> seq-file_revcom.embl>
+
+B<or>
+
+C<perl cat_seq.pl multi-seq_file.embl | perl revcom_seq.pl -i embl
+E<gt> seq_file_cat_revcom.embl>
+
+=head1 DESCRIPTION
+
+This script reverse complements (multi-)sequence files. The
+features/annotations in RichSeq files (e.g. EMBL or GENBANK format)
+will also be adapted accordingly. Use option B<-o> to specify a
+different output sequence format. Input files can be given directly via
+C<STDIN> or as a file. If C<STDIN> is used, the input sequence file
+format has to be given with option B<-i>. Be careful to set the correct
+input format.
+
+=head1 OPTIONS
+
+=over 20
+
+=item B<-h>, B<-help>
+
+Help (perldoc POD)
+
+=item B<-o>=I<str>, B<-outformat>=I<str>
+
+Specify different sequence format for the output [fasta, embl, or gbk]
+
+=item B<-i>=I<str>, B<-informat>=I<str>
+
+Specify the input sequence file format, only needed for C<STDIN> input
+
+=item B<-v>, B<-version>
+
+Print version number to C<STDOUT>
+
+=back
+
+=head1 OUTPUT
+
+=over 20
+
+=item C<STDOUT>
+
+The reverse complemented sequence file is printed to C<STDOUT>.
+Redirect or pipe into another tool as needed.
+
+=back
+
+=head1 EXAMPLES
+
+=over
+
+=item C<perl revcom_seq.pl -o gbk seq-file.embl E<gt>
+seq-file_revcom.gbk>
+
+=back
+
+B<or>
+
+=over
+
+=item C<for file in *.embl; do perl revcom_seq.pl -o fasta "$file"
+E<gt> "${file%.embl}"_revcom.fasta; done>
+
+=back
+
+=head1 DEPENDENCIES
+
+=over
+
+=item B<L<BioPerl|http://www.bioperl.org>>
+
+Tested with BioPerl version 1.007001
+
+=back
+
+=head1 VERSION
+
+ 0.2                                               update: 2015-12-10
+ 0.1                                                       2013-08-02
+
+=head1 AUTHOR
+
+ Andreas Leimbach                               aleimba[at]gmx[dot]de
+
+=head1 LICENSE
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 (GPLv3) of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see L<http://www.gnu.org/licenses/>.
+
+=cut
+
+
+########
+# MAIN #
+########
+
+use strict;
+use warnings;
+use autodie;
+use Getopt::Long;
+use Pod::Usage;
+use Bio::SeqIO; # bioperl module to handle sequence input/output
+#use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
+#use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
+
+### Get options with Getopt::Long
+my $In_Format; # input seq file format needed for STDIN
+my $Out_Format; # optional different output seq file format
+my $VERSION = 0.2;
+my ($Opt_Version, $Opt_Help);
+GetOptions ('informat=s' => \$In_Format,
+            'outformat=s' => \$Out_Format,
+            'version' => \$Opt_Version,
+            'help|?' => \$Opt_Help)
+            or pod2usage(-verbose => 1, -exitval => 2);
+
+
+
+### Run perldoc on POD
+pod2usage(-verbose => 2) if ($Opt_Help);
+if ($Opt_Version) {
+    print "$0 $VERSION\n";
+    exit;
+}
+
+
+
+### Check input (@ARGV and STDIN)
+if (-t STDIN && ! @ARGV) {
+    my $warning = "\n### Fatal error: No STDIN and no input file given as argument, please supply one of them and/or see help with '-h'!\n";
+    pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
+} elsif (!-t STDIN && @ARGV) {
+    my $warning = "\n### Fatal error: Both STDIN and an input file given as argument, please supply only either one and/or see help with '-h'!\n";
+    pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
+}
+die "\n### Fatal error: Too many arguments given, only STDIN or one input file allowed as argument! Please see the usage with option '-h' if unclear!\n" if (@ARGV > 1);
+die "\n### Fatal error: File '$ARGV[0]' does not exist!\n" if (@ARGV && $ARGV[0] ne '-' && !-e $ARGV[0]);
+
+
+
+### Bio::SeqIO objects for input and output
+print STDERR "\nReverse complementing";
+my $Seqin; # Bio::SeqIO object
+if (-t STDIN) { # input from file
+    warn "\n### Warning: Ignoring input file format ('-i $In_Format'), because input file given and not STDIN!\n\n" if ($In_Format);
+    my $seq_file = shift;
+    $Seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
+    print STDERR " '$seq_file' ";
+} elsif (!-t STDIN) { # input from STDIN
+    die "\n### Fatal error: Sequence file given as STDIN requires an input file format, please set one with option '-i' and/or see help with '-h'!\n" if (!$In_Format);
+    $In_Format = 'genbank' if ($In_Format =~ /(gbk|gb)/i); # allow shorter format string for 'genbank'
+    $Seqin = Bio::SeqIO->new(-fh => \*STDIN, -format => $In_Format); # capture typeglob of STDIN, requires '-format'
+    print STDERR " input file ";
+}
+print STDERR "...\n";
+
+my $Seqout; # Bio::SeqIO object
+if ($Out_Format) {
+    $Out_Format = 'genbank' if ($Out_Format =~ /(gbk|gb)/i);
+} else { # same format as input file
+    if (!-t STDIN) {
+        $Out_Format = $In_Format;
+    } else {
+        if (ref($Seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
+            $Out_Format = $1;
+        } else {
+            die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
+        }
+    }
+}
+$Seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $Out_Format); # printing to STDOUT requires '-format'
+
+
+### Write reverse complemented sequence (and its features) to STDOUT
+while (my $seq_obj = $Seqin->next_seq) { # Bio::Seq object; for multi-seq files
+    my $revcom = Bio::SeqUtils->revcom_with_features($seq_obj);
+    $Seqout->write_seq($revcom);
+}
+
+exit;