annotate htseq.pl @ 4:e0e1518e56f6 draft

Uploaded
author rouan
date Thu, 26 Dec 2013 05:35:56 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
1 #!/usr/bin/perl
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
2
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
3 use strict;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
4 use warnings;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
5 use Getopt::Std;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
6 use File::Basename;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
7 $| = 1;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
8
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
9 # Grab and set all options
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
10 my %OPTIONS = (a => 0, i => "gene_id", m => "intersection-nonempty", s => "no", t => "exon");
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
11 getopts('a:cg:i:m:o:r:s:t:', \%OPTIONS);
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
12
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
13 die qq(
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
14 Usage: HTSeq.pl [OPTIONS] Group1=sample1=<SAM/BAM file> [Group1=sample2=<SAM/BAM file> ... Group2=sampleN=<SAM/BAM file> ...]
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
15
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
16 OPTIONS: -a STR skip all reads with alignment quality lower than the given minimum value (default: $OPTIONS{a})
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
17 -c reduce the matrix by removing any feature with no counts
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
18 -g STR the features file in the GFF/GTF format
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
19 -i STR GFF attribute to be used as feature ID (default: $OPTIONS{i})
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
20 -m STR mode to handle reads overlapping more than one feature. Possible values for <mode> are union, intersection-strict and intersection-nonempty (default: $OPTIONS{m})
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
21 -o STR output file name for expression matrix
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
22 -r STR the name of the output report
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
23 -s STR whether the data is from a strand-specific assay (default: $OPTIONS{s})
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
24 -t STR feature type (3rd column in GFF file) to be used, all features of other type are ignored (default, suitable for RNA-Seq and Ensembl GTF files: $OPTIONS{t})
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
25
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
26 ) if(@ARGV == 0);
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
27
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
28 my $sam_out;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
29 my @counts;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
30 my @features;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
31 my %report;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
32 my @samplenames;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
33 my $current_group;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
34 my @groups;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
35 my @files;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
36 my $groupcount = 0;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
37 my %grouphash;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
38
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
39 foreach my $input (@ARGV) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
40 my ($group, $sample, $input) = split "::", $input;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
41 if(! defined $grouphash{$group}) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
42 $groupcount++;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
43 $grouphash{$group} = "G${groupcount}:$group";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
44 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
45 push @groups, $grouphash{$group};
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
46 push @samplenames, $sample;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
47 push @files, $input;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
48 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
49
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
50 for(my $index = 0; $index <= $#files; $index++) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
51 my $input_file = $files[$index];
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
52 my $sample = $samplenames[$index];
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
53
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
54 # run htseq
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
55 my @htseq;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
56 my $COMM;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
57 my $file_type = `file $input_file`;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
58 if(grep /text$/, $file_type ) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
59 $COMM = "htseq-count -q -m $OPTIONS{m} -s $OPTIONS{s} -a $OPTIONS{a} -t $OPTIONS{t} -i $OPTIONS{i} $input_file $OPTIONS{g}";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
60 @htseq = `$COMM`;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
61 } else {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
62 $COMM = "samtools view $input_file | htseq-count -q -m $OPTIONS{m} -s $OPTIONS{s} -a $OPTIONS{a} -t $OPTIONS{t} -i $OPTIONS{i} - $OPTIONS{g}";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
63 @htseq = `$COMM`;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
64 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
65
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
66 my $row = 0;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
67 $report{$sample} = "Command Used: $COMM\n";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
68
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
69 for(my $row = 0; $row <= $#htseq; $row++) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
70 # store the report is an hash
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
71 if(grep /^no_feature|^ambiguous|^too_low_aQual|^not_aligned|^alignment_not_unique/, $htseq[$row]) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
72 $report{$sample} .= $htseq[$row];
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
73 } else {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
74 # store the counts in a matrix
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
75 chomp $htseq[$row];
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
76 my ($feature, $value) = split "\t", $htseq[$row];
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
77 $counts[$row][$index] = $value;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
78 if($input_file eq $files[0]) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
79 push @features, $feature;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
80 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
81 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
82 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
83 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
84
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
85 # print the matrix
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
86 open(MATRIX, ">$OPTIONS{o}") || die "Could Not Create Output File $OPTIONS{o}!\n";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
87 print MATRIX "#\t".join("\t", @groups)."\n";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
88 print MATRIX "#Feature\t".join("\t", @samplenames)."\n";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
89 for(my $row = 0; $row <= $#features; $row++) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
90 if(defined $OPTIONS{c}) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
91 my $rowsum = 0;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
92 $rowsum += $_ foreach @{ $counts[$row] };
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
93 if(!$rowsum) {
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
94 next;
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
95 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
96 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
97 print MATRIX "$features[$row]\t".join("\t", @{ $counts[$row] })."\n";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
98 }
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
99 close(MATRIX);
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
100
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
101 # print the report
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
102 open(REPORT, ">$OPTIONS{r}") || die "Could Not Create Output File $OPTIONS{r}!\n";
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
103 print REPORT "$groups[$_]:$samplenames[$_]\n$report{$samplenames[$_]}\n" foreach (0..$#samplenames);
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
104 close(REPORT);
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
105
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
106
e0e1518e56f6 Uploaded
rouan
parents:
diff changeset
107