0
|
1 package hmdb_api ;
|
|
2
|
|
3 use strict;
|
|
4 use warnings ;
|
|
5 use Exporter ;
|
|
6 use Carp ;
|
|
7
|
|
8 use Data::Dumper ;
|
|
9 use XML::Twig ;
|
|
10
|
|
11 use csv ;
|
|
12
|
|
13 use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS);
|
|
14
|
|
## Module version and Exporter wiring.
our $VERSION = '1.0' ;
our @ISA     = ( 'Exporter' ) ;

## Every public sub is exported by default ...
our @EXPORT = (
    'getMetaboliteFeatures',
    'cowmetdb_handle',
    'cowmetdb_hash',
    'cowmetdb_hash_to_inhouse_format',
    'buildMetabolitesArray',
    'setMetaboliteAcurrateMzToModesMz',
) ;

## ... and the very same list (as an independent copy) is reachable through the :ALL tag.
our %EXPORT_TAGS = ( ALL => [ @EXPORT ] ) ;
|
|
19
|
|
20 =head1 NAME
|
|
21
|
|
22 hmdb_api - parse HMDB / CowMetDB metabocard and XML files and compute [M+H]+ / [M-H]- masses
|
|
23
|
|
24 =head1 SYNOPSIS
|
|
25
|
|
26 use hmdb_api;
|
|
27 my $object = hmdb_api->new();
|
|
28 my ( $metabolites, $count ) = $object->getMetaboliteFeatures( $xmlFile );
|
|
29
|
|
30 =head1 DESCRIPTION
|
|
31
|
|
32 This module reads metabolite records from a metabocard flat file
|
|
33 or from an XML file, computes their [M+H]+ and [M-H]- masses and
|
|
34 converts the records to arrays of rows ready to be written out.
|
|
35
|
|
36 =head1 METHODS
|
|
37
|
|
38 Methods are :
|
|
39
|
|
=head2 METHOD new

	## Description : constructor - build an empty object blessed into the invoking class
	## Input : $class (class name or existing object ; defaults to this package)
	## Output : $self (blessed empty hash reference)
	## Usage : my $api = hmdb_api->new() ;

=cut

sub new {
    ## Retrieve Values
    my $class = shift ;

    ## Accept a class name, an existing object (its class is reused) or nothing at all
    ## (plain function call), so that subclasses get objects of their own class.
    $class = ref($class) || $class || __PACKAGE__ ;

    ## Variables
    my $self = {} ;
    bless( $self, $class ) ;
    return $self ;
}
|
|
55 ### END of SUB
|
|
56
|
|
57
|
|
=head2 METHOD cowmetdb_handle

	## Description : read a flat file into memory (one chomped line per element) and count its complete metabocard entries.
	## Input : $flat (path of the flat file)
	## Output : $handler (array reference of lines), $entries (reference to the scalar number of entries)
	## Errors : croaks when the file does not exist or cannot be opened
	## Usage : my ( $handler, $entries ) = cowmetdb_handle( $flat ) ;

=cut
## START of SUB
sub cowmetdb_handle {
    ## Retrieve Values
    my $self = shift ;
    my ( $flat ) = @_ ;

    my @handle  = () ;
    my $entries = 0 ;
    my $in_card = 0 ; ## true between a #BEGIN_METABOCARD line and its #END_METABOCARD line

    if ( ( !defined $flat ) or ( !-e $flat ) ) {
        croak "Can't find the source file " . ( defined $flat ? $flat : '(undefined)' ) . "\n" ;
    }

    ## three-argument open on a lexical handle : the file name is never parsed for a mode
    ## and names with leading/trailing blanks are opened as given
    open( my $fh, '<', $flat ) or croak "Can't read the file $flat : $!\n" ;

    while ( my $line = <$fh> ) {
        chomp $line ;
        push( @handle, $line ) ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            $in_card = 1 ;
        }
        elsif ( ( $line =~ /^#END_METABOCARD/ ) and $in_card ) {
            ## count entries : an END line only counts when a BEGIN line was seen before it
            $entries++ ;
            $in_card = 0 ;
        }
    }
    close( $fh ) ;

    ## the entry count is returned as a scalar reference, the lines as an array reference
    return( \@handle, \$entries ) ;
}
|
|
94 ## END of SUB
|
|
95
|
|
=head2 METHOD cowmetdb_hash

	## Description : walk the lines of a metabocard flat file and build one hash per entry from a fixed selection of fields
	## Input : $handler (array reference of lines)
	## Output : $entries (array reference of hash references)
	## Errors : croaks when $handler is not defined
	## Usage : my ( $entries ) = cowmetdb_hash( $handler ) ;

=cut
## START of SUB
sub cowmetdb_hash {
    ## Retrieve Values
    my $self = shift ;
    my ( $handle ) = @_ ;

    croak "Handle is not defined : parsing step impossible\n" unless ( defined $handle ) ;

    ## metabocard field label => key under which its value is stored in the entry
    my %key_for = (
        'name'                 => 'COMMON_NAME',
        'iupac'                => 'IUPAC',
        'kegg_compound_id'     => 'KEGG_ID',
        'chemical_formula'     => 'FORMULA',
        'taxonomy_super_class' => 'TAXONOMY',
        'cas_number'           => 'CAS',
        'biofluid_location'    => 'LOCATION',
        'inchi_identifier'     => 'INCHI',
        'weight_average'       => 'MZ_AVERAGE',
        'weight_mono'          => 'MZ_MONO',
        'biocyc_id'            => 'BIOCYC_ID',
        'hmdb_id'              => 'HMDB_ID',
    ) ;

    my @cards   = () ;
    my %current = () ;

    foreach my $idx ( 0 .. $#{$handle} ) {
        my $line = $handle->[$idx] ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            ## a new card starts : forget whatever was collected so far
            %current = () ;
        }
        elsif ( $line =~ /^#END_METABOCARD/ ) {
            ## store a copy, the working hash is reused for the next card
            push( @cards, { %current } ) ;
        }
        elsif ( ( $line =~ /^# ([a-z_]+):/ ) and ( exists $key_for{$1} ) ) {
            ## the value of a field is the line right after its label
            $current{ $key_for{$1} } = $handle->[ $idx + 1 ] ;
        }
    }

    return( \@cards ) ;
}
|
|
141 ## END of SUB
|
|
142
|
|
143
|
|
144
|
|
=head2 METHOD getMetaboliteFeatures

	## Description : collect, for each metabolite accession of a xml file, its name, chemical formula, monoisotopic weight and inchikey
	## Input : $xmlFile
	## Output : $metabolites (hash reference keyed by accession), $X (number of accessions)
	## Note : when $xmlFile does not exist nothing is parsed and an empty hash reference and 0 are returned
	## Usage : my ( $metabolites, $X ) = getMetaboliteFeatures($xmlFile) ;

=cut
sub getMetaboliteFeatures {
    ## Retrieve Values
    my $self = shift ;
    my ( $xmlFile ) = @_ ;

    my %features_of = () ;
    my $current_id  = undef ; ## accession the other handlers file their value under

    ## build a handler that stores the text of the element being handled ($_) under
    ## $field for the current accession ; the assignment stays the last expression so
    ## the value returned by each handler is unchanged
    my $store_as = sub {
        my ( $field ) = @_ ;
        return sub { $features_of{$current_id}{$field} = $_ -> text_only ; } ;
    } ;

    if ( -e $xmlFile ) {

        my %handlers = (
            'metabolite/accession' => sub { $current_id = $_ -> text_only ; $features_of{$current_id} = undef ; } ,
            # metabolite name
            'metabolite/name' => $store_as->('metabolite_name') ,
            # metabolite chemical_formula
            'metabolite/chemical_formula' => $store_as->('chemical_formula') ,
            # metabolite monisotopic_molecular_weight
            'metabolite/monisotopic_molecular_weight' => $store_as->('monisotopic_molecular_weight') , ## general case
            'metabolite/monisotopic_moleculate_weight' => $store_as->('monisotopic_molecular_weight') , ## alternate tag spelling, same field
            # metabolite inchikey
            'metabolite/inchikey' => $store_as->('inchikey') ,
        ) ;

        my $twig = XML::Twig->nparse_ppe(
            twig_handlers => \%handlers,
            pretty_print  => 'indented',
            error_context => 1,
            $xmlFile
        ) ;

        $twig->purge ;
    }

    ## get number of entries:
    my $X = keys %features_of ;

    return ( \%features_of, $X ) ;
}
|
|
193 ### END of SUB
|
|
194
|
|
=head2 METHOD setMetaboliteAcurrateMzToModesMz

	## Description : add M+H and M-H masses to each metabolite (M) having an accurate (monoisotopic) mass
	##               M+H = ( mass + proton_mass - electron_mass ) * charge
	##               M-H = ( mass - proton_mass + electron_mass ) * charge
	## Input : $format ('XML' : hash reference keyed by id ; 'CARD' : array reference of entries),
	##         $metabolites, $proton_mass, $electron_mass, $charge (optional, defaults to 1)
	## Output : $mzsMetabolites (the very same structure, modified in place ; untouched for any other $format)
	## Note : a warning is emitted for each metabolite without a (true) monoisotopic mass
	## Usage : my ( $mzsMetabolites ) = setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ;

=cut
## START of SUB
sub setMetaboliteAcurrateMzToModesMz {
    ## Retrieve Values
    my $self = shift ;
    my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_;

    ## an omitted charge used to be multiplied in as undef (i.e. 0) and zeroed every mass
    $charge = 1 if ( !defined $charge ) ;
    ## an omitted format matches neither branch below : metabolites are returned untouched
    $format = '' if ( !defined $format ) ;

    ## returns the ( M+H, M-H ) pair of a given accurate mass
    my $modes_mz_of = sub {
        my ( $mass ) = @_ ;
        return (
            ( $mass + $proton_mass - $electron_mass ) * $charge,
            ( $mass - $proton_mass + $electron_mass ) * $charge,
        ) ;
    } ;

    if ( $format eq 'XML' ) {
        foreach my $id ( sort keys %{$metabolites} ) {
            if ( $metabolites->{$id}{'monisotopic_molecular_weight'} ) {
                my $tmp_mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
                ( $metabolites->{$id}{'[M+H]+'}, $metabolites->{$id}{'[M-H]-'} ) = $modes_mz_of->( $tmp_mass ) ;
            }
            else {
                warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
            }
        }
    }
    elsif ( $format eq 'CARD' ) {
        foreach my $entry ( @$metabolites ) {
            if ( $entry->{'MZ_MONO'} ) {
                my $tmp_mass = $entry->{'MZ_MONO'} ;
                ( $entry->{'MZ_[M+H]+'}, $entry->{'MZ_[M-H]-'} ) = $modes_mz_of->( $tmp_mass ) ;
            }
            else {
                ## an entry may also lack its id : do not interpolate an undefined value
                my $entry_id = defined $entry->{'HMDB_ID'} ? $entry->{'HMDB_ID'} : '' ;
                warn "No MZ_MONO field exists with metabolite $entry_id\n " ;
            }
        }
    }

    return ( $metabolites ) ;
}
|
|
237 ### END of SUB
|
|
238
|
|
=head2 METHOD buildMetabolitesArray

	## Description : turn the metabolites hash (xml extraction) into an array of rows, header first, one row per id in sorted id order
	## Input : $metabolites, $headers (optional array reference used as first row)
	## Output : $metabolitesSorted (array reference of array references)
	## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ;

=cut
## START of SUB
sub buildMetabolitesArray {
    ## Retrieve Values
    my $self = shift ;
    my ( $metabolites, $headers ) = @_;

    ## features written after the id, in column order
    my @columns = ( 'monisotopic_molecular_weight', '[M+H]+', '[M-H]-', 'metabolite_name', 'chemical_formula', 'inchikey' ) ;

    ## the caller's header is used as is ; otherwise a default seven columns header is used
    my $first_row = ( defined $headers )
        ? $headers
        : ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ;

    my @rows = ( $first_row ) ;

    foreach my $accession ( sort keys %{$metabolites} ) {
        push( @rows, [ $accession, map { $metabolites->{$accession}{$_} } @columns ] ) ;
    }

    return ( \@rows ) ;
}
|
|
278 ### END of SUB
|
|
279
|
|
=head2 METHOD cowmetdb_hash_to_inhouse_format

	## Description : adaptor from cowmetdb entry hashes to the inhouse format : a header row followed by one row of values per entry
	## Input : $entries (array reference of hash references)
	## Output : $tsv_handler (array reference of array references)
	## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ;

=cut
## START of SUB
sub cowmetdb_hash_to_inhouse_format {
    ## Retrieve Values
    my $self = shift ;
    my ( $entries ) = @_ ;

    ## column names : they are both the first line and the keys read from each entry
    my @columns = ('HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA', 'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+', 'MZ_[M-H]-', 'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC') ;

    my @rows = ( \@columns ) ; ## first line

    foreach my $card ( @$entries ) {
        ## one entry by one line, a missing field gives an undefined cell
        push( @rows, [ map { $card->{$_} } @columns ] ) ;
    }

    return( \@rows ) ;
}
|
|
306 ## END of SUB
|
|
307
|
|
308
|
|
309 1 ;
|
|
310
|
|
311
|
|
312 __END__
|
|
313
|
|
314 =head1 SUPPORT
|
|
315
|
|
316 You can find documentation for this module with the perldoc command.
|
|
317
|
|
318 perldoc XXX.pm
|
|
319
|
|
320 =head1 Exports
|
|
321
|
|
322 =over 4
|
|
323
|
|
324 =item :ALL is ...
|
|
325
|
|
326 =back
|
|
327
|
|
328 =head1 AUTHOR
|
|
329
|
|
330 Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>
|
|
331
|
|
332 =head1 LICENSE
|
|
333
|
|
334 This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
|
|
335
|
|
336 =head1 VERSION
|
|
337
|
|
338 version 1 : xx / xx / 201x
|
|
339
|
|
340 version 2 : ??
|
|
341
|
|
342 =cut |