Mercurial > repos > fcaramia > edger
comparison htseq.pl @ 4:ebd59bc6855c draft
Uploaded
author | fcaramia |
---|---|
date | Wed, 12 Sep 2012 23:45:33 -0400 |
parents | |
children | 6324eefd9e91 |
comparison
equal
deleted
inserted
replaced
3:6965066838fc | 4:ebd59bc6855c |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Getopt::Std; | |
6 use File::Basename; | |
7 $| = 1; | |
8 | |
9 # Grab and set all options | |
10 my %OPTIONS = (a => 0, i => "gene_id", m => "intersection-nonempty", s => "no", t => "exon"); | |
11 getopts('a:cg:i:m:o:r:s:t:', \%OPTIONS); | |
12 | |
13 die qq( | |
14 Usage: HTSeq.pl [OPTIONS] Group1=sample1=<SAM/BAM file> [Group1=sample2=<SAM/BAM file> ... Group2=sampleN=<SAM/BAM file> ...] | |
15 | |
16 OPTIONS: -a STR skip all reads with alignment quality lower than the given minimum value (default: $OPTIONS{a}) | |
17 -c reduce the matrix by removing any feature with no counts | |
18 -g STR the features file in the GFF/GTF format | |
19 -i STR GFF attribute to be used as feature ID (default: $OPTIONS{i}) | |
20 -m STR mode to handle reads overlapping more than one feature. Possible values for <mode> are union, intersection-strict and intersection-nonempty (default: $OPTIONS{m}) | |
21 -o STR output file name for expression matrix | |
22 -r STR the name of the output report | |
23 -s STR whether the data is from a strand-specific assay (default: $OPTIONS{s}) | |
24 -t STR feature type (3rd column in GFF file) to be used, all features of other type are ignored (default, suitable for RNA-Seq and Ensembl GTF files: $OPTIONS{t}) | |
25 | |
26 ) if(@ARGV == 0); | |
27 | |
28 my $sam_out; | |
29 my @counts; | |
30 my @features; | |
31 my %report; | |
32 my @samplenames; | |
33 my $current_group; | |
34 my @groups; | |
35 my @files; | |
36 my $groupcount = 0; | |
37 my %grouphash; | |
38 | |
39 foreach my $input (@ARGV) { | |
40 my ($group, $sample, $input) = split "::", $input; | |
41 if(! defined $grouphash{$group}) { | |
42 $groupcount++; | |
43 $grouphash{$group} = "G${groupcount}:$group"; | |
44 } | |
45 push @groups, $grouphash{$group}; | |
46 push @samplenames, $sample; | |
47 push @files, $input; | |
48 } | |
49 | |
50 for(my $index = 0; $index <= $#files; $index++) { | |
51 my $input_file = $files[$index]; | |
52 my $sample = $samplenames[$index]; | |
53 | |
54 # run htseq | |
55 my @htseq; | |
56 my $COMM; | |
57 my $file_type = `file $input_file`; | |
58 if(grep /text$/, $file_type ) { | |
59 $COMM = "htseq-count -q -m $OPTIONS{m} -s $OPTIONS{s} -a $OPTIONS{a} -t $OPTIONS{t} -i $OPTIONS{i} $input_file $OPTIONS{g}"; | |
60 @htseq = `$COMM`; | |
61 } else { | |
62 $COMM = "samtools view $input_file | htseq-count -q -m $OPTIONS{m} -s $OPTIONS{s} -a $OPTIONS{a} -t $OPTIONS{t} -i $OPTIONS{i} - $OPTIONS{g}"; | |
63 @htseq = `$COMM`; | |
64 } | |
65 | |
66 my $row = 0; | |
67 $report{$sample} = "Command Used: $COMM\n"; | |
68 | |
69 for(my $row = 0; $row <= $#htseq; $row++) { | |
70 # store the report is an hash | |
71 if(grep /^no_feature|^ambiguous|^too_low_aQual|^not_aligned|^alignment_not_unique/, $htseq[$row]) { | |
72 $report{$sample} .= $htseq[$row]; | |
73 } else { | |
74 # store the counts in a matrix | |
75 chomp $htseq[$row]; | |
76 my ($feature, $value) = split "\t", $htseq[$row]; | |
77 $counts[$row][$index] = $value; | |
78 if($input_file eq $files[0]) { | |
79 push @features, $feature; | |
80 } | |
81 } | |
82 } | |
83 } | |
84 | |
85 # print the matrix | |
86 open(MATRIX, ">$OPTIONS{o}") || die "Could Not Create Output File $OPTIONS{o}!\n"; | |
87 print MATRIX "#\t".join("\t", @groups)."\n"; | |
88 print MATRIX "#Feature\t".join("\t", @samplenames)."\n"; | |
89 for(my $row = 0; $row <= $#features; $row++) { | |
90 if(defined $OPTIONS{c}) { | |
91 my $rowsum = 0; | |
92 $rowsum += $_ foreach @{ $counts[$row] }; | |
93 if(!$rowsum) { | |
94 next; | |
95 } | |
96 } | |
97 print MATRIX "$features[$row]\t".join("\t", @{ $counts[$row] })."\n"; | |
98 } | |
99 close(MATRIX); | |
100 | |
101 # print the report | |
102 open(REPORT, ">$OPTIONS{r}") || die "Could Not Create Output File $OPTIONS{r}!\n"; | |
103 print REPORT "$groups[$_]:$samplenames[$_]\n$report{$samplenames[$_]}\n" foreach (0..$#samplenames); | |
104 close(REPORT); | |
105 | |
106 | |
107 |