Mercurial > repos > marpiech > norwich_tools_docking
comparison tools/rdock/bin/sdreport @ 3:b02d74d22d05 draft default tip
planemo upload
| author | marpiech |
|---|---|
| date | Mon, 29 Aug 2016 08:23:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:bd50f811878f | 3:b02d74d22d05 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 # Produces text summaries of SD records * | |
| 3 # * | |
| 4 # Usage: sdreport [-l] [-t] [-s<DataField>] [sdFiles] * | |
| 5 # * | |
| 6 # -l output data fields for each record as processed * | |
| 7 # -t tabulate Rbt.Score.* fields for each record as processed * | |
| 8 # -s summarise data fields for all records * | |
| 9 # -s<DataField> summarise data fields for each unique value * | |
| 10 # of <DataField> * | |
| 11 # * | |
| 12 # Note: If -l or -t are combined with -s, the listing/table is output * | |
| 13 # within each ligand summary * | |
| 14 # If sdFiles not given, reads from standard input * | |
| 15 # Output is to standard output * | |
| 16 # Default is equivalent to sdfilter -l * | |
| 17 #******************************************************************************* | |
| 18 use lib "$ENV{'RBT_ROOT'}/lib"; | |
| 19 | |
| 20 use SDRecord; | |
| 21 | |
| 22 # Default field names and headings for rDock v4.00 scores | |
| 23 my @defaultFields = ('SCORE','SCORE.INTER','SCORE.INTRA','SCORE.RESTR','SCORE.INTER.VDW'); | |
| 24 my @defaultHeadings = ('TOTAL','INTER','INTRA','RESTR','VDW'); | |
| 25 | |
| 26 # Default field names and headings for normalised scores (score / #ligand heavy atoms) | |
| 27 my @defaultNormFields = ('SCORE.norm','SCORE.INTER.norm','SCORE.INTRA.norm','SCORE.RESTR.norm','SCORE.heavy'); | |
| 28 my @defaultNormHeadings = ('TOTALn','INTERn','INTRAn','RESTRn','#heavy'); | |
| 29 | |
| 30 # Default field names and headings for rDock v3.00 scores | |
| 31 my @defaultOldFields = ('Rbt.Score.Corrected','Rbt.Score.Inter','Rbt.Score.Intra','Rbt.Score.IntraMin','Rbt.Score.Restraint'); | |
| 32 my @defaultOldHeadings = ('TOTAL','INTER','INTRA','INTRAMIN','RESTR'); | |
| 33 | |
| 34 my $listFormat = 0; | |
| 35 my $summaryFormat = 0; | |
| 36 my $tableFormat = 0; | |
| 37 my $supplierFormat = 0; | |
| 38 my $csvFormat = 0; | |
| 39 my $summaryKey = "_TITLE1"; | |
| 40 my $oldFields = 0;#If true, use old default field names for component scores | |
| 41 my $headings = 1;#DM 21 Nov 2000, If false, don't output headings | |
| 42 my @outputFields; | |
| 43 my @outputHeadings; | |
| 44 | |
| 45 #Print help if no command line arguments | |
| 46 printHelpAndExit() if (scalar(@ARGV) == 0); | |
| 47 | |
| 48 #Parse command line arguments | |
| 49 my $nArgs = scalar(@ARGV); | |
| 50 | |
| 51 while (scalar(@ARGV)) { | |
| 52 my $arg = shift @ARGV; | |
| 53 printHelpAndExit() if ($arg eq '-h'); | |
| 54 if (index($arg,'-l')==0) { | |
| 55 $listFormat = 1; | |
| 56 } | |
| 57 elsif (index($arg,'-o')==0) { | |
| 58 $oldFields = 1; | |
| 59 } | |
| 60 # 7 Feb 2005 (DM) Option to report normalised aggregate scores | |
| 61 elsif (index($arg,'-norm')==0) { | |
| 62 $oldFields = 2; | |
| 63 } | |
| 64 elsif (index($arg,'-sup')==0) { | |
| 65 $supplierFormat = 1; | |
| 66 } | |
| 67 elsif (index($arg,'-s')==0) { | |
| 68 $summaryFormat = 1; | |
| 69 } | |
| 70 elsif (index($arg,'-id')==0) { | |
| 71 $summaryKey = substr($arg,3); | |
| 72 } | |
| 73 elsif (index($arg,'-nh')==0) { | |
| 74 $headings = 0; | |
| 75 } | |
| 76 elsif (index($arg,'-t')==0) { | |
| 77 $tableFormat = 1; | |
| 78 push @outputFields, split(',',substr($arg,2)); | |
| 79 push @outputHeadings, @outputFields; | |
| 80 } | |
| 81 elsif (index($arg,'-c')==0) { | |
| 82 $csvFormat = 1; | |
| 83 push @outputFields, split(',',substr($arg,2)); | |
| 84 push @outputHeadings, @outputFields; | |
| 85 } | |
| 86 else { | |
| 87 push @files,$arg;#must be a filename | |
| 88 } | |
| 89 } | |
| 90 push @ARGV,@files;#put the filenames back in the arg list | |
| 91 | |
| 92 #use -l if neither table format is specified | |
| 93 $listFormat = (!$tableFormat && !$csvFormat && !$supplierFormat); | |
| 94 | |
| 95 #If no output fields defined for -t or -c use the defaults (old or new) | |
| 96 if (scalar(@outputFields)==0) { | |
| 97 if ($oldFields == 1) { | |
| 98 @outputFields = @defaultOldFields; | |
| 99 @outputHeadings = @defaultOldHeadings; | |
| 100 } | |
| 101 elsif ($oldFields == 2) { | |
| 102 @outputFields = @defaultNormFields; | |
| 103 @outputHeadings = @defaultNormHeadings; | |
| 104 } | |
| 105 else { | |
| 106 @outputFields = @defaultFields; | |
| 107 @outputHeadings = @defaultHeadings; | |
| 108 } | |
| 109 } | |
| 110 | |
| 111 my $sdRec = new SDRecord; | |
| 112 my %summary;#hash of SDRecord lists, indexed by user-defined summary key | |
| 113 my %indexByName; | |
| 114 my %indexByNum; | |
| 115 my $idx = 0; | |
| 116 my $nRec = 0; | |
| 117 | |
| 118 #Column headings for tab and csv format | |
| 119 #DM 21 Nov 2000 - if $headings is false, then don't output the column headings | |
| 120 if ($tableFormat && !$summaryFormat && $headings) { | |
| 121 tabHeadings($summaryKey,@outputHeadings); | |
| 122 } | |
| 123 if ($csvFormat && !$summaryFormat && $headings) { | |
| 124 csvHeadings($summaryKey,@outputHeadings); | |
| 125 } | |
| 126 | |
| 127 #read records | |
| 128 while ($sdRec->readRec('LINES'=>1,'DATA'=>1)) { | |
| 129 $sdRec->addData('_REC' => ++$nRec);#add record# as temp data field | |
| 130 if ($listFormat && !$summaryFormat) { | |
| 131 print "\n\nRECORD #$nRec\n"; | |
| 132 $sdRec->writeData(); | |
| 133 } | |
| 134 if ($tableFormat && !$summaryFormat) { | |
| 135 @recList = ($sdRec); | |
| 136 tabScores(\@recList,$summaryKey,@outputFields); | |
| 137 } | |
| 138 elsif ($csvFormat && !$summaryFormat) { | |
| 139 @recList = ($sdRec); | |
| 140 csvScores(\@recList,$summaryKey,@outputFields); | |
| 141 } | |
| 142 elsif ($supplierFormat && !$summaryFormat) { | |
| 143 @recList = ($sdRec); | |
| 144 tabulateSuppliers(\@recList,$summaryKey); | |
| 145 } | |
| 146 #add record to summary, indexed by user field value | |
| 147 #keep a separate index of unique values of user field values, | |
| 148 #indexed by number in the order the values first appear | |
| 149 #In this way we can maintain the sorted order of ligands | |
| 150 #when we come to print out the summary | |
| 151 if ($summaryFormat) { | |
| 152 my $summaryValue = $sdRec->{'DATA'}->{$summaryKey}; | |
| 153 #New data field value encountered | |
| 154 if (!defined $indexByName{$summaryValue}) { | |
| 155 $idx++;#incr the number of unique values | |
| 156 #keep cross-referenced indexes (field value <-> number) | |
| 157 $indexByName{$summaryValue} = $idx; | |
| 158 $indexByNum{$idx} = $summaryValue; | |
| 159 } | |
| 160 push @{$summary{$summaryValue}},$sdRec->copy('DATA'=>1); | |
| 161 } | |
| 162 } | |
| 163 | |
| 164 #Print summary if required | |
| 165 if ($summaryFormat) { | |
| 166 print "\n===============================================================\n"; | |
| 167 print "SUMMARY BY $summaryKey\n"; | |
| 168 foreach $idx (sort {$a<=>$b} keys %indexByNum) {#numberic sort of index numbers | |
| 169 my $key = $indexByNum{$idx};#look up corresponding data field value | |
| 170 print "\n===============================================================\n"; | |
| 171 print "$summaryKey = $key (#$idx)\n\n"; | |
| 172 writeSummary($summary{$key}); | |
| 173 if ($listFormat) { | |
| 174 print "\nIndividual records:\n"; | |
| 175 foreach $rec (@{$summary{$key}}) { | |
| 176 print "\n"; | |
| 177 $rec->writeData(); | |
| 178 } | |
| 179 } | |
| 180 if ($tableFormat) { | |
| 181 print "\nScores:\n"; | |
| 182 tabHeadings($summaryKey,@outputHeadings); | |
| 183 tabScores($summary{$key},$summaryKey,@outputFields); | |
| 184 } | |
| 185 if ($csvFormat) { | |
| 186 print "\nScores:\n"; | |
| 187 csvHeadings($summaryKey,@outputHeadings); | |
| 188 csvScores($summary{$key},$summaryKey,@outputFields); | |
| 189 } | |
| 190 } | |
| 191 } | |
| 192 | |
| 193 ############################################################## | |
| 194 # writes a summary to STDOUT for a list of SDRecords | |
| 195 # Input is a reference to an array of SDRecords | |
| 196 sub writeSummary { | |
| 197 my $recListRef = shift; | |
| 198 | |
| 199 #Extract the list of data values for each fieldname into a hash array | |
| 200 #(key=fieldname, value=array ref) | |
| 201 my %fields; | |
| 202 foreach $rec (@{$recListRef}) { | |
| 203 my ($key,$value); | |
| 204 while ( ($key,$value) = each %{$rec->{'DATA'}}) { | |
| 205 push @{$fields{$key}},$value; | |
| 206 } | |
| 207 } | |
| 208 | |
| 209 #Look for constant fields and store separately | |
| 210 my %constFields; | |
| 211 foreach $key (keys %fields) { | |
| 212 my @values = sort @{$fields{$key}}; | |
| 213 my $nVal = scalar(@values); | |
| 214 if ($values[0] eq $values[$nVal -1]) { | |
| 215 $constFields{$key} = $values[0];#store the field name and the constant value | |
| 216 delete $fields{$key};#remove from (non-const) array | |
| 217 } | |
| 218 } | |
| 219 | |
| 220 #Print constant fields | |
| 221 print "\nConstant fields:\n\n"; | |
| 222 foreach $key (sort keys %constFields) { | |
| 223 printf "%-40s%s\n",$key,$constFields{$key}; | |
| 224 } | |
| 225 #Print min and max value for non-const fields | |
| 226 print "\nVariable fields:\n\n"; | |
| 227 foreach $key (sort keys %fields) { | |
| 228 my @values = @{$fields{$key}}; | |
| 229 #Look at first value to decide whether to do text or numeric sort | |
| 230 if (isNaN($values[0])) { | |
| 231 @values = sort @values;#text sort | |
| 232 } | |
| 233 else { | |
| 234 @values = sort {$a <=> $b} @values;#numeric sort | |
| 235 } | |
| 236 my $nVal = scalar(@values); | |
| 237 printf "%-40s",$key; | |
| 238 print "Min = $values[0]\tMax = $values[$nVal-1]\t(N = $nVal)\n"; | |
| 239 } | |
| 240 } | |
| 241 | |
| 242 ############################################################## | |
| 243 # function isNaN equivalent to the C++, java, javascript isNaN | |
| 244 # From P Vaglio, ~intranet/lib/rbt_func.pl | |
| 245 sub isNaN { | |
| 246 local($_) = @_; | |
| 247 s/\s+//g; # strip white space | |
| 248 # match +or- beginning of line 0 or 1 time | |
| 249 # then any numeric 0 or more | |
| 250 # then a decimal point | |
| 251 # then any numeric 0 or more after decimal point | |
| 252 # then possibly an e or E then + or - then any numreci at least once | |
| 253 if (/^([+-]?)(\d*)(\.(\d*))?([Ee]([+-]?\d+))?$/ && (defined $2 || defined $4)) { | |
| 254 return 0; | |
| 255 } else { | |
| 256 return 1; | |
| 257 } | |
| 258 } | |
| 259 | |
| 260 ############################################################## | |
| 261 # output corresponding headings for use with tabScores | |
| 262 sub tabHeadings { | |
| 263 my $summaryKey = shift; | |
| 264 my @fieldNames = @_; | |
| 265 printf("%-10s%-30s","REC",$summaryKey); | |
| 266 foreach $field (@fieldNames) { | |
| 267 printf("%10s",$field); | |
| 268 } | |
| 269 print "\n"; | |
| 270 } | |
| 271 | |
| 272 ############################################################## | |
| 273 # tab-delimited output of named data field values | |
| 274 sub tabScores { | |
| 275 my $recListRef = shift; | |
| 276 my $summaryKey = shift; | |
| 277 my @fieldNames = @_; | |
| 278 foreach $rec (@{$recListRef}) { | |
| 279 printf("%03d\t%-30.30s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); | |
| 280 foreach $field (@fieldNames) { | |
| 281 my $val = $rec->{'DATA'}->{$field}; | |
| 282 if (isNaN($val)) { | |
| 283 printf("%-10.12s",$val); | |
| 284 } | |
| 285 else { | |
| 286 printf("%10.3f",$val); | |
| 287 } | |
| 288 } | |
| 289 print "\n"; | |
| 290 } | |
| 291 } | |
| 292 | |
| 293 ############################################################## | |
| 294 # output corresponding headings for use with csvScores | |
| 295 sub csvHeadings { | |
| 296 my $summaryKey = shift; | |
| 297 my @fieldNames = @_; | |
| 298 printf("%s,%s","REC",$summaryKey); | |
| 299 foreach $field (@fieldNames) { | |
| 300 printf(",%s",$field); | |
| 301 } | |
| 302 print "\n"; | |
| 303 } | |
| 304 | |
| 305 ############################################################## | |
| 306 # comma-delimited output of named data field values | |
| 307 sub csvScores { | |
| 308 my $recListRef = shift; | |
| 309 my $summaryKey = shift; | |
| 310 my @fieldNames = @_; | |
| 311 foreach $rec (@{$recListRef}) { | |
| 312 printf("%d,%s",$rec->{'DATA'}->{'_REC'},$rec->{'DATA'}->{$summaryKey}); | |
| 313 foreach $field (@fieldNames) { | |
| 314 my $val = $rec->{'DATA'}->{$field}; | |
| 315 if (isNaN($val)) { | |
| 316 printf(",%s",$val); | |
| 317 } | |
| 318 else { | |
| 319 printf(",%.3f",$val); | |
| 320 } | |
| 321 } | |
| 322 print "\n"; | |
| 323 } | |
| 324 } | |
| 325 | |
| 326 | |
| 327 ############################################################## | |
| 328 # standardised output of Catalyst supplier field | |
| 329 # Input is a reference to an array of SDRecords | |
| 330 # and a ligand identifier field to output in column 1 (def=Name) | |
| 331 sub tabulateSuppliers { | |
| 332 my $recListRef = shift; | |
| 333 my $summaryKey = shift || 'Name'; | |
| 334 foreach $rec (@{$recListRef}) { | |
| 335 my $suppBase = $rec->{'DATAREF'}->{'Supplier'}+1; | |
| 336 my $linesRef = $rec->{'LINES'}; | |
| 337 my $name = $rec->{'DATA'}->{$summaryKey}; | |
| 338 | |
| 339 #Output some useful compound info | |
| 340 my $name = $rec->{'DATA'}->{$summaryKey}; | |
| 341 my $molFormula = $rec->{'DATA'}->{'MolFormula'}; | |
| 342 my $molWt = $rec->{'DATA'}->{'MolWt'}; | |
| 343 my $casNum = $rec->{'DATA'}->{'CAS_num'}; | |
| 344 my $mdlNum = $rec->{'DATA'}->{'MDLNUMBER'}; | |
| 345 print "\n\n====================================================================================================\n"; | |
| 346 printf("%-10.10s%s\n","Name:",$name); | |
| 347 printf("%-10.10s%s\n","Formula:",$molFormula); | |
| 348 printf("%-10.10s%s\n","Mol.wt:",$molWt); | |
| 349 printf("%-10.10s%s\n","CAS #:",$casNum); | |
| 350 printf("%-10.10s%s\n","MDL #:",$mdlNum); | |
| 351 | |
| 352 #Get all the supplier record lines into a list | |
| 353 #Record is terminated by blank line | |
| 354 my @lines; | |
| 355 my $nLines = 0; | |
| 356 for (; $$linesRef[$suppBase+$nLines] ne ""; $nLines++) { | |
| 357 push @lines,$$linesRef[$suppBase+$nLines]; | |
| 358 } | |
| 359 | |
| 360 #Column headings | |
| 361 printf("\n%-20.20s%-40.40s%-40.40s\n", | |
| 362 "Supplier", | |
| 363 "Comment", | |
| 364 "Price" | |
| 365 ); | |
| 366 print "----------------------------------------------------------------------------------------------------\n"; | |
| 367 | |
| 368 #Loop over each supplier | |
| 369 my $iLine = 0; | |
| 370 for (; $iLine < $nLines; $iLine++) { | |
| 371 #collect supplier info lines | |
| 372 my @supplierInfo = (); | |
| 373 for (; $lines[$iLine] ne "." && $iLine < $nLines; $iLine++) { | |
| 374 push @supplierInfo,$lines[$iLine]; | |
| 375 } | |
| 376 #Check for incomplete record | |
| 377 if ($iLine == $nLines) { | |
| 378 print "** INCOMPLETE RECORD **\n"; | |
| 379 last; | |
| 380 } | |
| 381 my $nSupplierInfo = scalar(@supplierInfo); | |
| 382 my $supplier = $supplierInfo[0]; | |
| 383 #loop over each grade | |
| 384 for ($iLine++; ($lines[$iLine] ne "........................") && ($iLine < $nLines); $iLine++) { | |
| 385 #collect grade info lines | |
| 386 my @gradeInfo = (); | |
| 387 for (; index($lines[$iLine],"_") ne 0 && $iLine < $nLines; $iLine++) { | |
| 388 push @gradeInfo,$lines[$iLine]; | |
| 389 } | |
| 390 #Check for incomplete record | |
| 391 if ($iLine == $nLines) { | |
| 392 print "** INCOMPLETE RECORD **\n"; | |
| 393 last; | |
| 394 } | |
| 395 my $grade = $gradeInfo[0]; | |
| 396 #loop over each price info line | |
| 397 for (; index($lines[$iLine],".") ne 0 && $iLine < $nLines; $iLine++) { | |
| 398 my @priceInfo = split(" ",$lines[$iLine]); | |
| 399 my $price = join(" ",@priceInfo); | |
| 400 printf("%-20.20s%-40.39s%-40.40s\n", | |
| 401 $supplier, | |
| 402 $grade, | |
| 403 $price); | |
| 404 } | |
| 405 #Check for incomplete record | |
| 406 if ($iLine == $nLines) { | |
| 407 print "** INCOMPLETE RECORD **\n"; | |
| 408 last; | |
| 409 } | |
| 410 last if $lines[$iLine] eq "........................"; | |
| 411 } | |
| 412 } | |
| 413 } | |
| 414 } | |
| 415 | |
| 416 | |
| 417 ####################################################################### | |
| 418 sub printHelpAndExit { | |
| 419 print "\nProduces text summaries of SD records\n"; | |
| 420 print "\nUsage:\tsdreport [-l] [-t[<FieldName,FieldName...>]] [-c<FieldName,FieldName...>] [-id<IDField>] [-nh] [-o] [-s] [-sup] [sdFiles]\n"; | |
| 421 print "\n\t-l (list format) output all data fields for each record as processed\n"; | |
| 422 print "\t-t (tab format) tabulate selected fields for each record as processed\n"; | |
| 423 print "\t-c (csv format) comma delimited output of selected fields for each record as processed\n"; | |
| 424 print "\t-s (summary format) output summary statistics for each unique value of ligand ID\n"; | |
| 425 print "\t-sup (supplier format) tabulate supplier details (from Catalyst)\n"; | |
| 426 print "\t-id<IDField> data field to use as ligand ID\n"; | |
| 427 print "\t-nh don't output column headings in -t and -c formats\n"; | |
| 428 print "\t-o use old (v3.00) score field names as default columns in -t and -c formats, else use v4.00 field names\n"; | |
| 429 print "\t-norm use normalised score field names as default columns in -t and -c formats (normalised = score / #ligand heavy atoms)\n"; | |
| 430 print "\nNote:\tIf -l, -t or -c are combined with -s, the listing/table is output within each ligand summary\n"; | |
| 431 print "\t-sup should not be combined with other options\n"; | |
| 432 print "\tDefault field names for -t and -c are rDock score field names\n"; | |
| 433 print "\tDefault ID field name is Name\n"; | |
| 434 print "\n\tIf sdFiles not given, reads from standard input\n"; | |
| 435 print "\tOutput is to standard output\n\n"; | |
| 436 exit; | |
| 437 } |
