Mercurial > repos > fgiacomoni > downloader_bank_hmdb
annotate lib/hmdb_api.pm @ 2:be504ccbc41c draft default tip
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
author | fgiacomoni |
---|---|
date | Wed, 30 Nov 2022 16:14:27 +0000 |
parents | 7c9269bded0e |
children |
rev | line source |
---|---|
0 | 1 package hmdb_api ; |
2 | |
3 use strict; | |
4 use warnings ; | |
5 use Exporter ; | |
6 use Carp ; | |
7 | |
8 use Data::Dumper ; | |
9 use XML::Twig ; | |
10 | |
11 use csv ; | |
12 | |
13 use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS); | |
14 | |
15 our $VERSION = "1.0"; | |
16 our @ISA = qw(Exporter); | |
17 our @EXPORT = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz); | |
18 our %EXPORT_TAGS = ( ALL => [qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz)] ); | |
19 | |
20 =head1 NAME | |
21 | |
22 My::Module - An example module | |
23 | |
24 =head1 SYNOPSIS | |
25 | |
26 use My::Module; | |
27 my $object = My::Module->new(); | |
28 print $object->as_string; | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 This module does not really exist, it | |
33 was made for the sole purpose of | |
34 demonstrating how POD works. | |
35 | |
36 =head1 METHODS | |
37 | |
38 Methods are : | |
39 | |
40 =head2 METHOD new | |
41 | |
42 ## Description : new | |
43 ## Input : $self | |
44 ## Ouput : bless $self ; | |
45 ## Usage : new() ; | |
46 | |
47 =cut | |
48 | |
49 sub new { | |
50 ## Variables | |
51 my $self={}; | |
52 bless($self) ; | |
53 return $self ; | |
54 } | |
55 ### END of SUB | |
56 | |
57 | |
58 =head2 METHOD cowmetdb_handle | |
59 | |
60 ## Description : open a flat file and push the contains in memory - compute entries number. | |
61 ## Input : $flat | |
62 ## Output : $handler, $entries | |
63 ## Usage : my ( $handler ) = cowmetdb_handle( $flat ) ; | |
64 | |
65 =cut | |
66 ## START of SUB | |
67 sub cowmetdb_handle { | |
68 ## Retrieve Values | |
69 my $self = shift ; | |
70 my ( $flat ) = @_ ; | |
71 | |
72 my @handle = () ; | |
73 my $entries = 0 ; | |
74 my ( $begin, $end ) = ( 0, 0 ) ; | |
75 | |
76 if ( -e $flat ) { | |
77 open(FILE, "<$flat") or die "Cant' read the file $flat\n" ; | |
78 while (my $line = <FILE>){ | |
79 chomp $line ; | |
80 push(@handle, $line) ; | |
81 if ( $line =~ /^#BEGIN_METABOCARD/ ) { $begin = 1 ; } | |
82 elsif ( ( $line =~ /^#END_METABOCARD/ ) and ( $begin == 1 ) ){ $end = 1 ; } | |
83 ## count entries | |
84 if ( ( $end == 1 ) and ( $begin == 1 ) ){ $entries++ ; ( $begin, $end ) = ( 0, 0 ) ; } | |
85 } | |
86 close(FILE) ; | |
87 } | |
88 else { | |
89 croak "Can't find the source file $flat\n" ; | |
90 } | |
91 | |
92 return(\@handle, \$entries) ; | |
93 } | |
94 ## END of SUB | |
95 | |
96 =head2 METHOD cowmetdb_hash | |
97 | |
98 ## Description : work on a hmdb flat text handler and field data (selected fields), build a hash for each found entry | |
99 ## Input : $handler | |
100 ## Output : $entries | |
101 ## Usage : my ( $entries ) = hmdb_hash( $handler ) ; | |
102 | |
103 =cut | |
104 ## START of SUB | |
105 sub cowmetdb_hash { | |
106 ## Retrieve Values | |
107 my $self = shift ; | |
108 my ( $handle ) = @_ ; | |
109 | |
110 my @entries = () ; | |
111 my %entry = () ; | |
112 my $pos = 0 ; | |
113 | |
114 if ( ( defined $handle ) ) { | |
115 foreach my $data ( @$handle ) { | |
116 | |
117 if( $data =~ /^#BEGIN_METABOCARD/ ) { %entry = () ; } | |
118 elsif( $data =~ /^#END_METABOCARD/ ) { my %temp = %entry ; push (@entries, \%temp) ; } | |
119 elsif( $data =~ /^# name:/ ) { $entry{'COMMON_NAME'} = $handle->[$pos+1] ; } | |
120 elsif( $data =~ /^# iupac:/ ) { $entry{'IUPAC'} = $handle->[$pos+1] ; } | |
121 elsif( $data =~ /^# kegg_compound_id:/ ) { $entry{'KEGG_ID'} = $handle->[$pos+1] ; } | |
122 elsif( $data =~ /^# chemical_formula:/ ) { $entry{'FORMULA'} = $handle->[$pos+1] ; } | |
123 elsif( $data =~ /^# taxonomy_super_class:/ ) { $entry{'TAXONOMY'} = $handle->[$pos+1] ; } | |
124 elsif( $data =~ /^# cas_number:/ ) { $entry{'CAS'} = $handle->[$pos+1] ; } | |
125 elsif( $data =~ /^# biofluid_location:/ ) { $entry{'LOCATION'} = $handle->[$pos+1] ; } | |
126 elsif( $data =~ /^# inchi_identifier:/ ) { $entry{'INCHI'} = $handle->[$pos+1] ; } | |
127 elsif( $data =~ /^# weight_average:/ ) { $entry{'MZ_AVERAGE'} = $handle->[$pos+1] ; } | |
128 elsif( $data =~ /^# weight_mono:/ ) { $entry{'MZ_MONO'} = $handle->[$pos+1] ; } | |
129 elsif( $data =~ /^# biocyc_id:/ ) { $entry{'BIOCYC_ID'} = $handle->[$pos+1] ; } | |
130 elsif( $data =~ /^# hmdb_id:/ ) { $entry{'HMDB_ID'} = $handle->[$pos+1] ; } | |
131 | |
132 $pos++ ; | |
133 } | |
134 } | |
135 else { | |
136 croak "Handle is not defined : parsing step impossible\n" ; | |
137 } | |
138 | |
139 return(\@entries) ; | |
140 } | |
141 ## END of SUB | |
142 | |
143 | |
144 | |
145 =head2 METHOD getMetaboliteFeatures | |
146 | |
147 ## Description : get metabolites features from a xml file | |
148 ## Input : $xmlFile, | |
149 ## Output : $metabolites | |
150 ## Usage : $metabolites = getMetaboliteFeatures($xmlFile) ; | |
151 | |
152 =cut | |
153 sub getMetaboliteFeatures { | |
154 ## Retrieve Values | |
155 my $self = shift ; | |
156 my ( $xmlFile ) = @_ ; | |
157 | |
158 my %metabolites = () ; | |
159 my $twig = undef ; | |
160 my $id = undef ; | |
161 | |
162 if (-e $xmlFile) { | |
163 | |
164 $twig = XML::Twig->nparse_ppe( | |
165 | |
166 twig_handlers => { | |
167 'metabolite/accession' => sub {$id = $_ -> text_only ; $metabolites{$id} = undef ; } , | |
168 # metabolite name | |
169 'metabolite/name' => sub { $metabolites{$id}{'metabolite_name'} = $_ -> text_only ; } , | |
170 # metabolite chemical_formula | |
171 'metabolite/chemical_formula' => sub { $metabolites{$id}{'chemical_formula'} = $_ -> text_only ; } , | |
172 # metabolite monisotopic_molecular_weight | |
173 'metabolite/monisotopic_molecular_weight' => sub { $metabolites{$id}{'monisotopic_molecular_weight'} = $_ -> text_only ; } , ## general case | |
174 'metabolite/monisotopic_moleculate_weight' => sub { $metabolites{$id}{'monisotopic_molecular_weight'} = $_ -> text_only ; } , ## | |
175 # metabolite inchikey | |
176 'metabolite/inchikey' => sub { $metabolites{$id}{'inchikey'} = $_ -> text_only ; } , | |
177 }, | |
178 pretty_print => 'indented', | |
179 error_context => 1, $xmlFile | |
180 ); | |
181 | |
182 # $twig->print; | |
183 $twig->purge ; | |
184 } | |
185 | |
186 ## get number of entries: | |
187 my $X = keys %metabolites ; | |
188 | |
189 return (\%metabolites, $X) ; | |
190 | |
191 | |
192 } | |
193 ### END of SUB | |
194 | |
195 =head2 METHOD setMetaboliteAcurrateMzToModesMz | |
196 | |
197 ## Description : set M+H and M-H masses from a metabolite (M) accurate mass | |
198 ## Input : $metabolites, $proton_mass, $electron_mass | |
199 ## Output : $mzsMetabolites | |
200 ## Usage : my ( $mzsMetabolites ) = setMetaboliteAcurrateMzToModesMz ( $metabolites, $proton_mass, $electron_mass ) ; | |
201 | |
202 =cut | |
203 ## START of SUB | |
204 sub setMetaboliteAcurrateMzToModesMz { | |
205 ## Retrieve Values | |
206 my $self = shift ; | |
207 my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_; | |
208 | |
209 if ($format eq 'XML') { | |
210 foreach my $id (sort keys %{$metabolites}) { | |
211 if ( $metabolites->{$id}{'monisotopic_molecular_weight'} ) { | |
212 my $tmp_mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ; | |
213 $metabolites->{$id}{'[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ; | |
214 $metabolites->{$id}{'[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ; | |
215 } | |
216 else { | |
2
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
217 warn "No monoisotopic_molecular_weight field exists with metabolite $id - this entry is not added in dumped file\n " ; |
0 | 218 } |
219 } | |
220 } | |
221 elsif ( ($format eq 'CARD') ) { | |
222 foreach my $entry (@$metabolites) { | |
223 if ( $entry->{'MZ_MONO'} ) { | |
224 my $tmp_mass = $entry->{'MZ_MONO'} ; | |
225 $entry->{'MZ_[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ; | |
226 $entry->{'MZ_[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ; | |
227 } | |
228 else { | |
2
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
229 warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'} - this entry is not added in dumped file\n " ; |
0 | 230 } |
231 } | |
232 } | |
233 | |
234 | |
235 return ($metabolites) ; | |
236 } | |
237 ### END of SUB | |
238 | |
239 =head2 METHOD buildMetabolitesArray | |
240 | |
241 ## Description : build a metabolite list from xml extraction | |
242 ## Input : $metabolites, $headers | |
243 ## Output : $metabolitesSorted | |
244 ## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ; | |
245 | |
246 =cut | |
247 ## START of SUB | |
248 sub buildMetabolitesArray { | |
249 ## Retrieve Values | |
250 my $self = shift ; | |
251 my ( $metabolites, $headers ) = @_; | |
252 my ( @metabolitesSorted ) = ( () ) ; | |
253 | |
254 ## header format is ['HMDB_ID','MzBank', 'MetName', 'ChemFormula', 'INChIkey'] | |
255 if (defined $headers) { | |
256 push ( @metabolitesSorted, $headers ) ; | |
257 } | |
258 else { | |
259 push ( @metabolitesSorted, ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ) ; | |
260 } | |
261 | |
262 foreach my $id (sort keys %{$metabolites}) { | |
263 my @tmp = () ; | |
264 push (@tmp, $id) ; | |
265 push (@tmp, $metabolites->{$id}{'monisotopic_molecular_weight'}) ; | |
266 push (@tmp, $metabolites->{$id}{'[M+H]+'}) ; | |
267 push (@tmp, $metabolites->{$id}{'[M-H]-'}) ; | |
268 push (@tmp, $metabolites->{$id}{'metabolite_name'}) ; | |
269 push (@tmp, $metabolites->{$id}{'chemical_formula'}) ; | |
270 push (@tmp, $metabolites->{$id}{'inchikey'}) ; | |
271 | |
272 # merge | |
2
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
273 |
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
274 ## filter entry if no monisotopic_molecular_weight is present |
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
275 if ($metabolites->{$id}{'monisotopic_molecular_weight'} ne '') { |
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
276 push (@metabolitesSorted, \@tmp) ; |
be504ccbc41c
master branch Updating with tag :CI_COMMIT_TAG - - Fxx
fgiacomoni
parents:
0
diff
changeset
|
277 } |
0 | 278 } |
279 | |
280 return (\@metabolitesSorted) ; | |
281 } | |
282 ### END of SUB | |
283 | |
284 =head2 METHOD cowmetdb_hash_to_inhouse_format | |
285 | |
286 ## Description : adaptator from hash cowmetdb entry to inhouse format | |
287 ## Input : $entries | |
288 ## Output : $tsv_handler | |
289 ## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ; | |
290 | |
291 =cut | |
292 ## START of SUB | |
293 sub cowmetdb_hash_to_inhouse_format { | |
294 ## Retrieve Values | |
295 my $self = shift ; | |
296 my ( $entries ) = @_ ; | |
297 | |
298 my @fields_name = ('HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA', 'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+', 'MZ_[M-H]-', 'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC') ; | |
299 my @tsv_handler = () ; | |
300 push (@tsv_handler, \@fields_name) ; ## first line | |
301 | |
302 foreach my $entry (@$entries) { | |
303 my @tmp = ( $entry->{'HMDB_ID'}, $entry->{'COMMON_NAME'}, $entry->{'CAS'}, $entry->{'FORMULA'}, $entry->{'MZ_MONO'}, $entry->{'MZ_AVERAGE'}, $entry->{'MZ_[M+H]+'}, $entry->{'MZ_[M-H]-'}, $entry->{'KEGG_ID'}, $entry->{'BIOCYC_ID'}, | |
304 $entry->{'INCHI'}, $entry->{'LOCATION'}, $entry->{'TAXONOMY'}, $entry->{'IUPAC'} ) ; | |
305 push (@tsv_handler, \@tmp) ; ## one entry by one line | |
306 } | |
307 | |
308 return(\@tsv_handler) ; | |
309 } | |
310 ## END of SUB | |
311 | |
312 | |
313 1 ; | |
314 | |
315 | |
316 __END__ | |
317 | |
318 =head1 SUPPORT | |
319 | |
320 You can find documentation for this module with the perldoc command. | |
321 | |
322 perldoc XXX.pm | |
323 | |
324 =head1 Exports | |
325 | |
326 =over 4 | |
327 | |
328 =item :ALL is ... | |
329 | |
330 =back | |
331 | |
332 =head1 AUTHOR | |
333 | |
334 Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt> | |
335 | |
336 =head1 LICENSE | |
337 | |
338 This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. | |
339 | |
340 =head1 VERSION | |
341 | |
342 version 1 : xx / xx / 201x | |
343 | |
344 version 2 : ?? | |
345 | |
346 =cut |