Mercurial > repos > fgiacomoni > downloader_bank_hmdb
annotate lib/hmdb_api.pm @ 2:be504ccbc41c draft default tip
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
| author | fgiacomoni |
|---|---|
| date | Wed, 30 Nov 2022 16:14:27 +0000 |
| parents | 7c9269bded0e |
| children |
| rev | line source |
|---|---|
| 0 | 1 package hmdb_api ; |
| 2 | |
| 3 use strict; | |
| 4 use warnings ; | |
| 5 use Exporter ; | |
| 6 use Carp ; | |
| 7 | |
| 8 use Data::Dumper ; | |
| 9 use XML::Twig ; | |
| 10 | |
| 11 use csv ; | |
| 12 | |
| 13 use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS); | |
| 14 | |
| 15 our $VERSION = "1.0"; | |
| 16 our @ISA = qw(Exporter); | |
| 17 our @EXPORT = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz); | |
| 18 our %EXPORT_TAGS = ( ALL => [qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz)] ); | |
| 19 | |
| 20 =head1 NAME | |
| 21 | |
| 22 My::Module - An example module | |
| 23 | |
| 24 =head1 SYNOPSIS | |
| 25 | |
| 26 use My::Module; | |
| 27 my $object = My::Module->new(); | |
| 28 print $object->as_string; | |
| 29 | |
| 30 =head1 DESCRIPTION | |
| 31 | |
| 32 This module does not really exist, it | |
| 33 was made for the sole purpose of | |
| 34 demonstrating how POD works. | |
| 35 | |
| 36 =head1 METHODS | |
| 37 | |
| 38 Methods are : | |
| 39 | |
| 40 =head2 METHOD new | |
| 41 | |
| 42 ## Description : new | |
| 43 ## Input : $self | |
| 44 ## Ouput : bless $self ; | |
| 45 ## Usage : new() ; | |
| 46 | |
| 47 =cut | |
| 48 | |
| 49 sub new { | |
| 50 ## Variables | |
| 51 my $self={}; | |
| 52 bless($self) ; | |
| 53 return $self ; | |
| 54 } | |
| 55 ### END of SUB | |
| 56 | |
| 57 | |
| 58 =head2 METHOD cowmetdb_handle | |
| 59 | |
| 60 ## Description : open a flat file and push the contains in memory - compute entries number. | |
| 61 ## Input : $flat | |
| 62 ## Output : $handler, $entries | |
| 63 ## Usage : my ( $handler ) = cowmetdb_handle( $flat ) ; | |
| 64 | |
| 65 =cut | |
| 66 ## START of SUB | |
| 67 sub cowmetdb_handle { | |
| 68 ## Retrieve Values | |
| 69 my $self = shift ; | |
| 70 my ( $flat ) = @_ ; | |
| 71 | |
| 72 my @handle = () ; | |
| 73 my $entries = 0 ; | |
| 74 my ( $begin, $end ) = ( 0, 0 ) ; | |
| 75 | |
| 76 if ( -e $flat ) { | |
| 77 open(FILE, "<$flat") or die "Cant' read the file $flat\n" ; | |
| 78 while (my $line = <FILE>){ | |
| 79 chomp $line ; | |
| 80 push(@handle, $line) ; | |
| 81 if ( $line =~ /^#BEGIN_METABOCARD/ ) { $begin = 1 ; } | |
| 82 elsif ( ( $line =~ /^#END_METABOCARD/ ) and ( $begin == 1 ) ){ $end = 1 ; } | |
| 83 ## count entries | |
| 84 if ( ( $end == 1 ) and ( $begin == 1 ) ){ $entries++ ; ( $begin, $end ) = ( 0, 0 ) ; } | |
| 85 } | |
| 86 close(FILE) ; | |
| 87 } | |
| 88 else { | |
| 89 croak "Can't find the source file $flat\n" ; | |
| 90 } | |
| 91 | |
| 92 return(\@handle, \$entries) ; | |
| 93 } | |
| 94 ## END of SUB | |
| 95 | |
| 96 =head2 METHOD cowmetdb_hash | |
| 97 | |
| 98 ## Description : work on a hmdb flat text handler and field data (selected fields), build a hash for each found entry | |
| 99 ## Input : $handler | |
| 100 ## Output : $entries | |
| 101 ## Usage : my ( $entries ) = hmdb_hash( $handler ) ; | |
| 102 | |
| 103 =cut | |
| 104 ## START of SUB | |
| 105 sub cowmetdb_hash { | |
| 106 ## Retrieve Values | |
| 107 my $self = shift ; | |
| 108 my ( $handle ) = @_ ; | |
| 109 | |
| 110 my @entries = () ; | |
| 111 my %entry = () ; | |
| 112 my $pos = 0 ; | |
| 113 | |
| 114 if ( ( defined $handle ) ) { | |
| 115 foreach my $data ( @$handle ) { | |
| 116 | |
| 117 if( $data =~ /^#BEGIN_METABOCARD/ ) { %entry = () ; } | |
| 118 elsif( $data =~ /^#END_METABOCARD/ ) { my %temp = %entry ; push (@entries, \%temp) ; } | |
| 119 elsif( $data =~ /^# name:/ ) { $entry{'COMMON_NAME'} = $handle->[$pos+1] ; } | |
| 120 elsif( $data =~ /^# iupac:/ ) { $entry{'IUPAC'} = $handle->[$pos+1] ; } | |
| 121 elsif( $data =~ /^# kegg_compound_id:/ ) { $entry{'KEGG_ID'} = $handle->[$pos+1] ; } | |
| 122 elsif( $data =~ /^# chemical_formula:/ ) { $entry{'FORMULA'} = $handle->[$pos+1] ; } | |
| 123 elsif( $data =~ /^# taxonomy_super_class:/ ) { $entry{'TAXONOMY'} = $handle->[$pos+1] ; } | |
| 124 elsif( $data =~ /^# cas_number:/ ) { $entry{'CAS'} = $handle->[$pos+1] ; } | |
| 125 elsif( $data =~ /^# biofluid_location:/ ) { $entry{'LOCATION'} = $handle->[$pos+1] ; } | |
| 126 elsif( $data =~ /^# inchi_identifier:/ ) { $entry{'INCHI'} = $handle->[$pos+1] ; } | |
| 127 elsif( $data =~ /^# weight_average:/ ) { $entry{'MZ_AVERAGE'} = $handle->[$pos+1] ; } | |
| 128 elsif( $data =~ /^# weight_mono:/ ) { $entry{'MZ_MONO'} = $handle->[$pos+1] ; } | |
| 129 elsif( $data =~ /^# biocyc_id:/ ) { $entry{'BIOCYC_ID'} = $handle->[$pos+1] ; } | |
| 130 elsif( $data =~ /^# hmdb_id:/ ) { $entry{'HMDB_ID'} = $handle->[$pos+1] ; } | |
| 131 | |
| 132 $pos++ ; | |
| 133 } | |
| 134 } | |
| 135 else { | |
| 136 croak "Handle is not defined : parsing step impossible\n" ; | |
| 137 } | |
| 138 | |
| 139 return(\@entries) ; | |
| 140 } | |
| 141 ## END of SUB | |
| 142 | |
| 143 | |
| 144 | |
| 145 =head2 METHOD getMetaboliteFeatures | |
| 146 | |
| 147 ## Description : get metabolites features from a xml file | |
| 148 ## Input : $xmlFile, | |
| 149 ## Output : $metabolites | |
| 150 ## Usage : $metabolites = getMetaboliteFeatures($xmlFile) ; | |
| 151 | |
| 152 =cut | |
| 153 sub getMetaboliteFeatures { | |
| 154 ## Retrieve Values | |
| 155 my $self = shift ; | |
| 156 my ( $xmlFile ) = @_ ; | |
| 157 | |
| 158 my %metabolites = () ; | |
| 159 my $twig = undef ; | |
| 160 my $id = undef ; | |
| 161 | |
| 162 if (-e $xmlFile) { | |
| 163 | |
| 164 $twig = XML::Twig->nparse_ppe( | |
| 165 | |
| 166 twig_handlers => { | |
| 167 'metabolite/accession' => sub {$id = $_ -> text_only ; $metabolites{$id} = undef ; } , | |
| 168 # metabolite name | |
| 169 'metabolite/name' => sub { $metabolites{$id}{'metabolite_name'} = $_ -> text_only ; } , | |
| 170 # metabolite chemical_formula | |
| 171 'metabolite/chemical_formula' => sub { $metabolites{$id}{'chemical_formula'} = $_ -> text_only ; } , | |
| 172 # metabolite monisotopic_molecular_weight | |
| 173 'metabolite/monisotopic_molecular_weight' => sub { $metabolites{$id}{'monisotopic_molecular_weight'} = $_ -> text_only ; } , ## general case | |
| 174 'metabolite/monisotopic_moleculate_weight' => sub { $metabolites{$id}{'monisotopic_molecular_weight'} = $_ -> text_only ; } , ## | |
| 175 # metabolite inchikey | |
| 176 'metabolite/inchikey' => sub { $metabolites{$id}{'inchikey'} = $_ -> text_only ; } , | |
| 177 }, | |
| 178 pretty_print => 'indented', | |
| 179 error_context => 1, $xmlFile | |
| 180 ); | |
| 181 | |
| 182 # $twig->print; | |
| 183 $twig->purge ; | |
| 184 } | |
| 185 | |
| 186 ## get number of entries: | |
| 187 my $X = keys %metabolites ; | |
| 188 | |
| 189 return (\%metabolites, $X) ; | |
| 190 | |
| 191 | |
| 192 } | |
| 193 ### END of SUB | |
| 194 | |
| 195 =head2 METHOD setMetaboliteAcurrateMzToModesMz | |
| 196 | |
| 197 ## Description : set M+H and M-H masses from a metabolite (M) accurate mass | |
| 198 ## Input : $metabolites, $proton_mass, $electron_mass | |
| 199 ## Output : $mzsMetabolites | |
| 200 ## Usage : my ( $mzsMetabolites ) = setMetaboliteAcurrateMzToModesMz ( $metabolites, $proton_mass, $electron_mass ) ; | |
| 201 | |
| 202 =cut | |
| 203 ## START of SUB | |
| 204 sub setMetaboliteAcurrateMzToModesMz { | |
| 205 ## Retrieve Values | |
| 206 my $self = shift ; | |
| 207 my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_; | |
| 208 | |
| 209 if ($format eq 'XML') { | |
| 210 foreach my $id (sort keys %{$metabolites}) { | |
| 211 if ( $metabolites->{$id}{'monisotopic_molecular_weight'} ) { | |
| 212 my $tmp_mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ; | |
| 213 $metabolites->{$id}{'[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ; | |
| 214 $metabolites->{$id}{'[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ; | |
| 215 } | |
| 216 else { | |
|
2
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
217 warn "No monoisotopic_molecular_weight field exists with metabolite $id - this entry is not added in dumped file\n " ; |
| 0 | 218 } |
| 219 } | |
| 220 } | |
| 221 elsif ( ($format eq 'CARD') ) { | |
| 222 foreach my $entry (@$metabolites) { | |
| 223 if ( $entry->{'MZ_MONO'} ) { | |
| 224 my $tmp_mass = $entry->{'MZ_MONO'} ; | |
| 225 $entry->{'MZ_[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ; | |
| 226 $entry->{'MZ_[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ; | |
| 227 } | |
| 228 else { | |
|
2
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
229 warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'} - this entry is not added in dumped file\n " ; |
| 0 | 230 } |
| 231 } | |
| 232 } | |
| 233 | |
| 234 | |
| 235 return ($metabolites) ; | |
| 236 } | |
| 237 ### END of SUB | |
| 238 | |
| 239 =head2 METHOD buildMetabolitesArray | |
| 240 | |
| 241 ## Description : build a metabolite list from xml extraction | |
| 242 ## Input : $metabolites, $headers | |
| 243 ## Output : $metabolitesSorted | |
| 244 ## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ; | |
| 245 | |
| 246 =cut | |
| 247 ## START of SUB | |
| 248 sub buildMetabolitesArray { | |
| 249 ## Retrieve Values | |
| 250 my $self = shift ; | |
| 251 my ( $metabolites, $headers ) = @_; | |
| 252 my ( @metabolitesSorted ) = ( () ) ; | |
| 253 | |
| 254 ## header format is ['HMDB_ID','MzBank', 'MetName', 'ChemFormula', 'INChIkey'] | |
| 255 if (defined $headers) { | |
| 256 push ( @metabolitesSorted, $headers ) ; | |
| 257 } | |
| 258 else { | |
| 259 push ( @metabolitesSorted, ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ) ; | |
| 260 } | |
| 261 | |
| 262 foreach my $id (sort keys %{$metabolites}) { | |
| 263 my @tmp = () ; | |
| 264 push (@tmp, $id) ; | |
| 265 push (@tmp, $metabolites->{$id}{'monisotopic_molecular_weight'}) ; | |
| 266 push (@tmp, $metabolites->{$id}{'[M+H]+'}) ; | |
| 267 push (@tmp, $metabolites->{$id}{'[M-H]-'}) ; | |
| 268 push (@tmp, $metabolites->{$id}{'metabolite_name'}) ; | |
| 269 push (@tmp, $metabolites->{$id}{'chemical_formula'}) ; | |
| 270 push (@tmp, $metabolites->{$id}{'inchikey'}) ; | |
| 271 | |
| 272 # merge | |
|
2
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
273 |
|
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
274 ## filter entry if no monisotopic_molecular_weight is present |
|
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
275 if ($metabolites->{$id}{'monisotopic_molecular_weight'} ne '') { |
|
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
276 push (@metabolitesSorted, \@tmp) ; |
|
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
277 } |
| 0 | 278 } |
| 279 | |
| 280 return (\@metabolitesSorted) ; | |
| 281 } | |
| 282 ### END of SUB | |
| 283 | |
| 284 =head2 METHOD cowmetdb_hash_to_inhouse_format | |
| 285 | |
| 286 ## Description : adaptator from hash cowmetdb entry to inhouse format | |
| 287 ## Input : $entries | |
| 288 ## Output : $tsv_handler | |
| 289 ## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ; | |
| 290 | |
| 291 =cut | |
| 292 ## START of SUB | |
| 293 sub cowmetdb_hash_to_inhouse_format { | |
| 294 ## Retrieve Values | |
| 295 my $self = shift ; | |
| 296 my ( $entries ) = @_ ; | |
| 297 | |
| 298 my @fields_name = ('HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA', 'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+', 'MZ_[M-H]-', 'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC') ; | |
| 299 my @tsv_handler = () ; | |
| 300 push (@tsv_handler, \@fields_name) ; ## first line | |
| 301 | |
| 302 foreach my $entry (@$entries) { | |
| 303 my @tmp = ( $entry->{'HMDB_ID'}, $entry->{'COMMON_NAME'}, $entry->{'CAS'}, $entry->{'FORMULA'}, $entry->{'MZ_MONO'}, $entry->{'MZ_AVERAGE'}, $entry->{'MZ_[M+H]+'}, $entry->{'MZ_[M-H]-'}, $entry->{'KEGG_ID'}, $entry->{'BIOCYC_ID'}, | |
| 304 $entry->{'INCHI'}, $entry->{'LOCATION'}, $entry->{'TAXONOMY'}, $entry->{'IUPAC'} ) ; | |
| 305 push (@tsv_handler, \@tmp) ; ## one entry by one line | |
| 306 } | |
| 307 | |
| 308 return(\@tsv_handler) ; | |
| 309 } | |
| 310 ## END of SUB | |
| 311 | |
| 312 | |
| 313 1 ; | |
| 314 | |
| 315 | |
| 316 __END__ | |
| 317 | |
| 318 =head1 SUPPORT | |
| 319 | |
| 320 You can find documentation for this module with the perldoc command. | |
| 321 | |
| 322 perldoc XXX.pm | |
| 323 | |
| 324 =head1 Exports | |
| 325 | |
| 326 =over 4 | |
| 327 | |
| 328 =item :ALL is ... | |
| 329 | |
| 330 =back | |
| 331 | |
| 332 =head1 AUTHOR | |
| 333 | |
| 334 Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt> | |
| 335 | |
| 336 =head1 LICENSE | |
| 337 | |
| 338 This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. | |
| 339 | |
| 340 =head1 VERSION | |
| 341 | |
| 342 version 1 : xx / xx / 201x | |
| 343 | |
| 344 version 2 : ?? | |
| 345 | |
| 346 =cut |
