Mercurial > repos > antmarge > compstrains
comparison compStrains.pl @ 19:8ea6bb9da985 draft default tip
Uploaded
| author | antmarge |
|---|---|
| date | Wed, 29 Mar 2017 18:04:58 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 18:12ba8fbec776 | 19:8ea6bb9da985 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 #Margaret Antonio 16.01.13 | |
| 4 | |
| 5 #DESCRIPTION: Takes two aggregate.pl outputs and compares them using mean difference, pval for each | |
| 6 #gene. Can compare, for example, 19F in glucose and TIGR4 in glucose. | |
| 7 #DIFFERENT GENOMES (ie. diff. strains). | |
| 8 #Requires CONVERSION FILE | |
| 9 | |
| 10 | |
| 11 use Data::Dumper; | |
| 12 use strict; | |
| 13 use Getopt::Long; | |
| 14 #use warnings; | |
| 15 use File::Path; | |
| 16 use File::Basename; | |
| 17 use Statistics::Distributions; | |
| 18 | |
| 19 #ASSIGN INPUTS TO VARIABLES USING FLAGS | |
| 20 our ($input1,$input2,$out,$sortkey,$round,$l1,$l2,$cfile); | |
| 21 GetOptions( | |
| 22 'o:s' =>\$out, | |
| 23 's:i' => \$sortkey, | |
| 24 'r:i'=> \$round, | |
| 25 'l1:s'=> \$l1, | |
| 26 'l2:s'=> \$l2, | |
| 27 'c:s'=> \$cfile, | |
| 28 'input1:s'=>\$input1, | |
| 29 'input2:s'=>\$input2 | |
| 30 ); | |
| 31 | |
| 32 | |
| 33 #THE @files ARRAY WILL CONTAIN INPUT FILE NAMES | |
| 34 my @files=($input1,$input2); | |
| 35 | |
| 36 #GET LABELS: | |
| 37 my @labels = ($l1,$l2); | |
| 38 | |
| 39 sub cleaner{ | |
| 40 my $line=$_[0]; | |
| 41 chomp($line); | |
| 42 $line =~ s/\x0d{0,1}\x0a{0,1}\Z//s; | |
| 43 return $line; | |
| 44 } | |
| 45 | |
| 46 | |
| 47 #CHECK IF REQ. VARIABLES WERE DEFINED USING FLAGS. IF NOT THEN USE DEFAULT VALUES | |
| 48 if (!$out) {$out="comp.".$labels[0].$labels[1].".csv"} | |
| 49 if (!$round){$round='%.4f'} | |
| 50 | |
| 51 #OPEN INPUTTED AGGREGATE GENE FILES AND STORE THEIR CONTENTS INTO TWO HASHES | |
| 52 #FILE1 GOES INTO HASH %ONE AND FILE2 GOES INTO HASH %TWO. | |
| 53 | |
| 54 #FILE1 OPENING ---> %one WHERE KEY:VALUE IS GENE_ID:(GENE_ID,INSERTIONS,MEAN,ETC.) | |
| 55 my @header; | |
| 56 my %one; | |
| 57 | |
| 58 open (F1,'<',$files[0]); | |
| 59 | |
| 60 #STORE COLUMN NAMES (FIRST LINE OF FILE1) FOR HEADER AND APPEND LABELS | |
| 61 my $head=<F1>; #the header in the file | |
| 62 my @cols=split(',',$head); | |
| 63 @cols=@cols[0,1,2,3,4,5]; #get rid of blank columns | |
| 64 for (my $j=0;$j<scalar @cols;$j++){ | |
| 65 $cols[$j]=cleaner($cols[$j]).'-'.$labels[0]; #mark each column name with file it comes from | |
| 66 } | |
| 67 push (@header,@cols); | |
| 68 | |
| 69 while (my $line=<F1>){ | |
| 70 chomp $line; | |
| 71 my @info=split(",",$line); | |
| 72 #Only keep the first 6 columns (Ones about blanks aren't needed for comparisons) | |
| 73 @info=@info[0,1,2,3,4,5]; | |
| 74 #Sometimes genes that don't have a gene name can't be blank, so fill with NA | |
| 75 if (!$info[5]){ | |
| 76 $info[5]=""; | |
| 77 } | |
| 78 | |
| 79 $one{$info[0]}=\@info; | |
| 80 } | |
| 81 close F1; | |
| 82 | |
| 83 #FILE2 OPENING ---> %two WHERE KEY:VALUE IS GENE_ID:(GENE_ID,INSERTIONS,MEAN,ETC.) | |
| 84 | |
| 85 my %two; | |
| 86 open (F2,'<',$files[1]); | |
| 87 | |
| 88 #STORE COLUMN NAMES (FIRST LINE OF FILE2) FOR HEADER AND APPEND LABELS | |
| 89 $head=<F2>; #the header in the file | |
| 90 @cols=split(',',$head); | |
| 91 @cols=@cols[0,1,2,3,4,5]; #get rid of blank columns | |
| 92 for (my $j=0;$j<scalar @cols;$j++){ | |
| 93 $cols[$j]=cleaner($cols[$j]).'-'.$labels[1]; #mark each column name with file it comes from | |
| 94 } | |
| 95 push (@header,@cols); | |
| 96 | |
| 97 while (my $line=<F2>){ | |
| 98 $line = cleaner($line); | |
| 99 my @info=split(",",$line); | |
| 100 @info=@info[0,1,2,3,4,5]; | |
| 101 if (!$info[5]){ | |
| 102 $info[5]=""; | |
| 103 } | |
| 104 | |
| 105 $two{$info[0]}=\@info; | |
| 106 } | |
| 107 close F2; | |
| 108 | |
| 109 | |
| 110 #READ CONVERSION FILE INTO ARRAY. | |
| 111 #Conversion file must have strain 1 for file 1 in column 1 (index 0) and | |
| 112 #strain 2 for file 2 in column 2 (index 1) | |
| 113 #conversion file must be tab delimited with no NA fields | |
| 114 #If homologs (exist then take info from hashes (%one and %two) by referring to gene_id in KEY | |
| 115 | |
| 116 my @all; #store all homologs in this hash | |
| 117 open (CONV,'<',$cfile); | |
| 118 while (my $line=<CONV>){ | |
| 119 $line = cleaner($line); | |
| 120 my @genes=split("\t",$line); #Array @genes will contain two genes (SP_0000,SPT_0000) | |
| 121 if (scalar @genes==2 and (exists $one{$genes[0]}) and (exists $two{$genes[1]})){ | |
| 122 my @info; | |
| 123 my @oneArray=@{$one{$genes[0]}}; | |
| 124 my @twoArray=@{$two{$genes[1]}}; | |
| 125 push (@info,@oneArray,@twoArray); | |
| 126 # Fitness values at index 1 and 7 | |
| 127 my $diff=sprintf("$round",($info[1]-$info[7])); | |
| 128 my $total1=$info[2]; | |
| 129 my $total2=$info[8]; | |
| 130 if (!$info[3] or $info[3] eq ""){$info[3]="NA"}; | |
| 131 if (!$info[4] or $info[4] eq ""){$info[4]="NA"}; | |
| 132 if (!$info[9] or $info[9] eq ""){$info[9]="NA"}; | |
| 133 if (!$info[10] or $info[10] eq ""){$info[10]="NA"}; | |
| 134 my $sd1=$info[3]; | |
| 135 my $se1=$info[4]; | |
| 136 my $sd2=$info[9]; | |
| 137 my $se2=$info[10]; | |
| 138 my $df=$total1+$total2-2; | |
| 139 my ($tdist,$pval); | |
| 140 #TDIST, PVAL calculations with fail if standard dev, error, or counts are not real numbers | |
| 141 #or if 0 ends up in denominator | |
| 142 if ($sd1 eq "NA" or $sd2 eq "NA" or $total1==0 or $total2==0 or $sd1==0 or $sd2==0){ | |
| 143 ($tdist,$pval)=("NA","NA"); | |
| 144 } | |
| 145 else{ | |
| 146 $tdist=sqrt((($diff)/(sqrt((($sd1**2)/$total1)+(($sd2**2)/$total2))))**2); | |
| 147 $pval=Statistics::Distributions::tprob($df,$tdist); | |
| 148 } | |
| 149 push (@info,$diff,$df,$tdist,$pval); | |
| 150 push (@all,\@info); | |
| 151 } | |
| 152 } | |
| 153 close CONV; | |
| 154 | |
| 155 #SORT THE HOMOLOGS BY THE SORTKEY OR BY DEFAULT DIFFERENCE IN MEAN FITNESSES | |
| 156 if (!$sortkey){ | |
| 157 $sortkey=12; #for mean difference | |
| 158 } | |
| 159 my @sorted = sort { $b->[$sortkey] <=> $a->[$sortkey] } @all; | |
| 160 | |
| 161 #FINISH THE HEADER BY ADDING COLUMN NAMES FOR MEAN-DIFF, DOF, TDIST, AND PVALUE | |
| 162 my $field="MeanDiff(".$labels[0].'.'.$labels[1].")"; | |
| 163 push (@header,$field,"DOF","TDIST","PVALUE"); | |
| 164 | |
| 165 #PRINT MATCHED HOMOLOG INFORMATION INTO A SINGLE OUTPUT FILE | |
| 166 open OUT, '>',"$out"; | |
| 167 print OUT (join(',',@header),"\n"); | |
| 168 foreach (@sorted){ | |
| 169 my @comparison=@{$_}; | |
| 170 print OUT join(',',@comparison),"\n"; | |
| 171 } | |
| 172 | |
| 173 close OUT; | |
| 174 | |
| 175 |
