diff COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl	Thu May 30 11:52:25 2024 +0000
@@ -0,0 +1,340 @@
+#!/usr/bin/perl
+
+#######
+# POD #
+#######
+
+=pod
+
+=head1 NAME
+
+C<trunc_seq.pl> - truncate sequence files
+
+=head1 SYNOPSIS
+
+C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt>
+seq-file_trunc_20_3500.embl>
+
+B<or>
+
+C<perl trunc_seq.pl file_of_filenames_and_coords.tsv>
+
+=head1 DESCRIPTION
+
+This script truncates sequence files according to the given
+coordinates. The features/annotations in RichSeq files (e.g. EMBL or
+GENBANK format) will also be adapted accordingly. Use option B<-o> to
+specify a different output sequence format. Input can be given directly
+as a file and truncation coordinates to the script, with the start
+position as the first argument, stop as the second and (the path to)
+the sequence file as the third. In this case the truncated sequence
+entry is printed to C<STDOUT>. Input sequence files should contain only
+one sequence entry, if a multi-sequence file is used as input only the
+B<first> sequence entry is truncated.
+
+Alternatively, a file of filenames (fof) with respective coordinates
+and sequence files in the following B<tab-separated> format can be
+given to the script (the header is optional):
+
+ #start\tstop\tseq-file
+ 300\t9000\t(path/to/)seq-file
+ 50\t1300\t(path/to/)seq-file2
+
+With a fof the resulting truncated sequence files are printed into a
+results directory. Use option B<-r> to specify a different results
+directory than the default.
+
+It is also possible to truncate a RichSeq sequence file loaded into the
+L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser
+from the Sanger Institute: Select a subsequence and then go to Edit
+-E<gt> Subsequence (and Features)
+
+=head1 OPTIONS
+
+=over 20
+
+=item B<-h>, B<-help>
+
+Help (perldoc POD)
+
+=item B<-o>=I<str>, B<-outformat>=I<str>
+
+Specify different sequence format for the output (files) [fasta, embl,
+or gbk]
+
+=item B<-r>=I<str>, B<-result_dir>=I<str>
+
+Path to result folder for fof input [default = './trunc_seq_results']
+
+=item B<-v>, B<-version>
+
+Print version number to C<STDOUT>
+
+=back
+
+=head1 OUTPUT
+
+=over 20
+
+=item C<STDOUT>
+
+If a single sequence file is given to the script the truncated sequence
+file is printed to C<STDOUT>. Redirect or pipe into another tool as
+needed.
+
+=back
+
+B<or>
+
+=over 20
+
+=item F<./trunc_seq_results>
+
+If a fof is given to the script, all output files are stored in a
+results folder
+
+=item F<./trunc_seq_results/seq-file_trunc_start_stop.format>
+
+Truncated output sequence files are named appended with 'trunc' and the
+corresponding start and stop positions
+
+=back
+
+=head1 EXAMPLES
+
+=over
+
+=item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt>
+seq-file_trunc_120_3000.gbk>
+
+=back
+
+B<or>
+
+=over
+
+=item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl
+revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta>
+
+=back
+
+B<or>
+
+=over
+
+=item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl
+file_of_filenames_and_coords.tsv>
+
+=back
+
+=head1 DEPENDENCIES
+
+=over
+
+=item B<L<BioPerl|http://www.bioperl.org>>
+
+Tested with BioPerl version 1.007001
+
+=back
+
+=head1 VERSION
+
+ 0.2                                               update: 2015-12-07
+ 0.1                                                       2013-08-02
+
+=head1 AUTHOR
+
+ Andreas Leimbach                               aleimba[at]gmx[dot]de
+
+=head1 LICENSE
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 (GPLv3) of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see L<http://www.gnu.org/licenses/>.
+
+=cut
+
+
+########
+# MAIN #
+########
+
+use strict;
+use warnings;
+use autodie;
+use Getopt::Long;
+use Pod::Usage;
+use Bio::SeqIO; # bioperl module to handle sequence input/output
+#use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
+#use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
+
+### Get options with Getopt::Long
+my $Out_Format_Opt; # optional different output seq file format
+my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input
+my $VERSION = 0.2;
+my ($Opt_Version, $Opt_Help);
+GetOptions ('outformat=s' => \$Out_Format_Opt,
+            'result_dir=s' => \$Result_Dir,
+            'version' => \$Opt_Version,
+            'help|?' => \$Opt_Help)
+            or pod2usage(-verbose => 1, -exitval => 2);
+
+
+
+### Run perldoc on POD
+pod2usage(-verbose => 2) if ($Opt_Help);
+if ($Opt_Version) {
+    print "$0 $VERSION\n";
+    exit;
+}
+
+
+
+### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc.
+my $Fof; # file of filenames (fof) with truncation coords
+my $Start;
+my $Stop;
+my $Seq_File;
+if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) {
+    my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n";
+    pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
+} elsif (@ARGV == 1) { # fof
+    check_file_exists($ARGV[0]); # subroutine to check for file existence
+    $Fof = shift;
+} elsif (@ARGV == 3) {
+    check_file_exists($ARGV[2]); # subroutine
+    if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) {
+        my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n";
+        pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
+    }
+    ($Start, $Stop, $Seq_File) = @ARGV;
+}
+
+
+
+### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof
+if ($Fof) {
+    open (my $fof_fh, "<", "$Fof");
+
+    # create result folder
+    $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
+    if (-e $Result_Dir) {
+        empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
+    } else {
+        mkdir $Result_Dir;
+    }
+
+    while (my $line = <$fof_fh>) {
+        chomp $line;
+        next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines
+
+        die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/);
+        ($Start, $Stop, $Seq_File) = split(/\t/, $line);
+        check_file_exists($Seq_File); # subroutine
+
+        my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object
+        my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames
+        $seqout->write_seq($truncseq);
+    }
+    close $fof_fh;
+
+} else { # single seq file, @ARGV == 3
+    my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine
+    my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output
+    $seqout->write_seq($truncseq);
+}
+
+exit;
+
+
+
+###############
+# Subroutines #
+###############
+
+### Subroutine to check if file exists
+sub check_file_exists {
+    my $file = shift;
+    die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file);
+}
+
+
+
+### Subroutine to empty a directory with user interaction
+sub empty_dir {
+    my $dir = shift;
+    print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
+    my $user_ask = <STDIN>;
+    if ($user_ask =~ /y/i) {
+        unlink glob "$dir/*"; # remove all files in results directory
+    } else {
+        die "\nScript abborted!\n";
+    }
+    return 1;
+}
+
+
+
+### Subroutine to create a Bio::SeqIO output object
+sub seq_out {
+    my ($seqin, $start, $stop, $seq_file) = @_;
+
+    my $out_format; # need to keep $Out_Format_Opt for several seq files with fof
+    if ($Out_Format_Opt) {
+        $Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format
+        $out_format = $Out_Format_Opt;
+    } else { # same format as input file
+        if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
+            $out_format = $1;
+        } else {
+            die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
+        }
+    }
+
+    my $seqout; # Bio::SeqIO object
+    if ($seq_file) { # fof
+        $seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows)
+        my $file_ext;
+        if ($out_format eq 'genbank') {
+            $file_ext = 'gbk'; # back to shorter file extension for GENBANK format
+        } else {
+            $file_ext = $out_format;
+        }
+        $seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename
+        $seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format);
+
+    } else { # single seq file input
+        $seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format'
+    }
+
+    return $seqout;
+}
+
+
+
+### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object
+sub trunc_seq {
+    my ($start, $stop, $seq_file) = @_;
+    print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n";
+    my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
+    my $count = 0;
+    my $truncseq;
+    while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object
+        $count++;
+        if ($count > 1) {
+            warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n";
+            last;
+        }
+        $truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop);
+    }
+    return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout
+}