comparison lib/hmdb_api.pm @ 0:7c9269bded0e draft

Init repository for [downloader_bank_hmdb]
author fgiacomoni
date Tue, 14 Jan 2020 05:21:23 -0500
parents
children be504ccbc41c
comparison
equal deleted inserted replaced
-1:000000000000 0:7c9269bded0e
1 package hmdb_api ;
2
3 use strict;
4 use warnings ;
5 use Exporter ;
6 use Carp ;
7
8 use Data::Dumper ;
9 use XML::Twig ;
10
11 use csv ;
12
13 use vars qw($VERSION @ISA @EXPORT %EXPORT_TAGS);
14
15 our $VERSION = "1.0";
16 our @ISA = qw(Exporter);
17 our @EXPORT = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz);
18 our %EXPORT_TAGS = ( ALL => [qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz)] );
19
20 =head1 NAME
21
22 hmdb_api - Parse HMDB metabocard flat files and XML files into metabolite tables
23
24 =head1 SYNOPSIS
25
26 use hmdb_api;
27 my $object = hmdb_api->new();
28 my ( $metabolites, $count ) = $object->getMetaboliteFeatures( $xmlFile );
29
30 =head1 DESCRIPTION
31
32 This module reads HMDB metabolite records, either from
33 metabocard flat files or from XML files, computes the [M+H]+
34 and [M-H]- masses and builds tabular arrays of the results.
35
36 =head1 METHODS
37
38 Methods are :
39
40 =head2 METHOD new
41
42 ## Description : new
43 ## Input : $self
44 ## Output : bless $self ;
45 ## Usage : my $object = hmdb_api->new() ;
46
47 =cut
48
sub new {
    ## Constructor: returns an empty hash-based object.
    ## Input  : $class - class name or existing instance (optional)
    ## Output : blessed hash reference
    ## Usage  : my $object = hmdb_api->new() ;
    my ( $class ) = @_ ;

    ## Accept Class->new(), $object->new() and the legacy bare new() call:
    ## fall back to this package when no invocant is supplied.
    $class = ref($class) || $class || __PACKAGE__ ;

    ## Two-argument bless so that subclasses stay in their own class.
    my $self = {} ;
    return bless( $self, $class ) ;
}
55 ### END of SUB
56
57
58 =head2 METHOD cowmetdb_handle
59
60 ## Description : open a flat file, load its lines into memory and count the metabocard entries.
61 ## Input : $flat
62 ## Output : $handler (array ref of lines), $entries (scalar ref to the entry count)
63 ## Usage : my ( $handler, $entries ) = $object->cowmetdb_handle( $flat ) ;
64
65 =cut
66 ## START of SUB
sub cowmetdb_handle {
    ## Load a metabocard flat file into memory and count its entries.
    ## Input  : $flat - path of the flat file
    ## Output : \@handle  - array ref of the chomped lines, in file order
    ##          \$entries - scalar ref to the number of
    ##                      #BEGIN_METABOCARD ... #END_METABOCARD pairs
    ## Dies   : croaks when no path is given or the file does not exist,
    ##          dies when the file cannot be opened
    my $self = shift ;
    my ( $flat ) = @_ ;

    croak "Can't find the source file: no path given\n" unless ( defined $flat ) ;
    croak "Can't find the source file $flat\n" unless ( -e $flat ) ;

    ## Three-argument open with a lexical handle: the path is used verbatim
    ## (no mode or whitespace interpretation) and no global FILE handle is
    ## clobbered.
    open( my $fh, '<', $flat ) or die "Can't read the file $flat: $!\n" ;

    my @handle  = () ;
    my $entries = 0 ;
    my $in_card = 0 ;    ## true between a BEGIN marker and its END marker

    while ( my $line = <$fh> ) {
        chomp $line ;
        push( @handle, $line ) ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            $in_card = 1 ;
        }
        elsif ( $in_card and $line =~ /^#END_METABOCARD/ ) {
            ## an END marker only counts when it closes an opened card
            $entries++ ;
            $in_card = 0 ;
        }
    }
    close($fh) ;

    return ( \@handle, \$entries ) ;
}
94 ## END of SUB
95
96 =head2 METHOD cowmetdb_hash
97
98 ## Description : walk the lines of an HMDB flat file and, for a selected set of fields, build one hash per entry found
99 ## Input : $handler
100 ## Output : $entries
101 ## Usage : my ( $entries ) = $object->cowmetdb_hash( $handler ) ;
102
103 =cut
104 ## START of SUB
sub cowmetdb_hash {
    ## Build one hash per metabocard from the lines of a flat file.
    ## Input  : $handle - array ref of lines
    ## Output : array ref holding one hash ref per #END_METABOCARD line
    ## Dies   : croaks when $handle is undefined
    ##
    ## For every recognised "# tag:" line, the value stored is the line that
    ## immediately follows it in $handle.
    my ( $self, $handle ) = @_ ;

    croak "Handle is not defined : parsing step impossible\n" unless ( defined $handle ) ;

    ## flat-file tag => key used in the entry hash
    my %key_for = (
        'name'                 => 'COMMON_NAME',
        'iupac'                => 'IUPAC',
        'kegg_compound_id'     => 'KEGG_ID',
        'chemical_formula'     => 'FORMULA',
        'taxonomy_super_class' => 'TAXONOMY',
        'cas_number'           => 'CAS',
        'biofluid_location'    => 'LOCATION',
        'inchi_identifier'     => 'INCHI',
        'weight_average'       => 'MZ_AVERAGE',
        'weight_mono'          => 'MZ_MONO',
        'biocyc_id'            => 'BIOCYC_ID',
        'hmdb_id'              => 'HMDB_ID',
    ) ;

    my @cards   = () ;
    my %current = () ;

    for my $idx ( 0 .. $#{$handle} ) {
        my $line = $handle->[$idx] ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            ## a new card starts: forget the previous fields
            %current = () ;
        }
        elsif ( $line =~ /^#END_METABOCARD/ ) {
            ## store a copy so later cards cannot overwrite this one
            push( @cards, { %current } ) ;
        }
        elsif ( $line =~ /^# ([a-z_]+):/ and exists $key_for{$1} ) {
            $current{ $key_for{$1} } = $handle->[ $idx + 1 ] ;
        }
    }

    return ( \@cards ) ;
}
141 ## END of SUB
142
143
144
145 =head2 METHOD getMetaboliteFeatures
146
147 ## Description : get metabolites features from a xml file
148 ## Input : $xmlFile,
149 ## Output : $metabolites (hash ref keyed by accession), $count (number of accessions)
150 ## Usage : my ( $metabolites, $count ) = $object->getMetaboliteFeatures( $xmlFile ) ;
151
152 =cut
sub getMetaboliteFeatures {
    ## Extract a few features of every metabolite found in an XML file.
    ## Input  : $xmlFile - path of the XML file
    ## Output : \%metabolites - hash ref keyed by the metabolite/accession text,
    ##                          each value holding the collected fields
    ##          $count        - number of keys in that hash
    ## When $xmlFile does not exist nothing is parsed and an empty hash
    ## and 0 are returned.
    my ( $self, $xmlFile ) = @_ ;

    my %metabolites = () ;
    my $current_id  = undef ;    ## accession most recently seen by the parser

    if ( -e $xmlFile ) {

        ## child element of <metabolite> => key stored under the accession.
        ## 'monisotopic_moleculate_weight' is a variant element name that is
        ## stored under the same key as the general case.
        my @field_map = (
            [ 'name',                          'metabolite_name' ],
            [ 'chemical_formula',              'chemical_formula' ],
            [ 'monisotopic_molecular_weight',  'monisotopic_molecular_weight' ],
            [ 'monisotopic_moleculate_weight', 'monisotopic_molecular_weight' ],
            [ 'inchikey',                      'inchikey' ],
        ) ;

        my %handlers = (
            ## a new accession (re)starts the record for that id
            'metabolite/accession' => sub {
                $current_id = $_->text_only ;
                $metabolites{$current_id} = undef ;
            },
        ) ;

        foreach my $pair (@field_map) {
            my ( $tag, $key ) = @{$pair} ;
            $handlers{"metabolite/$tag"} = sub {
                $metabolites{$current_id}{$key} = $_->text_only ;
            } ;
        }

        my $twig = XML::Twig->nparse_ppe(
            twig_handlers => \%handlers,
            pretty_print  => 'indented',
            error_context => 1,
            $xmlFile,
        ) ;

        $twig->purge ;
    }

    ## number of entries
    my $count = keys %metabolites ;

    return ( \%metabolites, $count ) ;
}
193 ### END of SUB
194
195 =head2 METHOD setMetaboliteAcurrateMzToModesMz
196
197 ## Description : set M+H and M-H masses from a metabolite (M) accurate mass
198 ## Input : $format ('XML' or 'CARD'), $metabolites, $proton_mass, $electron_mass, $charge
199 ## Output : $mzsMetabolites
200 ## Usage : my ( $mzsMetabolites ) = $object->setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ;
201
202 =cut
203 ## START of SUB
sub setMetaboliteAcurrateMzToModesMz {
    ## Add the [M+H]+ and [M-H]- masses to every metabolite having an
    ## accurate (monoisotopic) mass.
    ## Input  : $format        - 'XML'  : $metabolites is a hash ref keyed by id,
    ##                                    mass read from 'monisotopic_molecular_weight',
    ##                                    results stored under '[M+H]+' / '[M-H]-'
    ##                           'CARD' : $metabolites is an array ref of hash refs,
    ##                                    mass read from 'MZ_MONO',
    ##                                    results stored under 'MZ_[M+H]+' / 'MZ_[M-H]-'
    ##          $metabolites   - structure described above, modified in place
    ##          $proton_mass   - mass added (M+H) or removed (M-H)
    ##          $electron_mass - mass removed (M+H) or added (M-H)
    ##          $charge        - multiplier applied to both results (default 1)
    ## Output : the same $metabolites reference
    ## Entries without a mass are reported with warn and left untouched.
    my $self = shift ;
    my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_ ;

    ## An omitted charge used to multiply every mass by undef, i.e. 0.
    $charge = 1 unless ( defined $charge ) ;
    my $mode = defined $format ? $format : '' ;

    ## returns ( [M+H]+, [M-H]- ) for one accurate mass
    my $modes_of = sub {
        my ( $mass ) = @_ ;
        return (
            ( $mass + $proton_mass - $electron_mass ) * $charge,
            ( $mass - $proton_mass + $electron_mass ) * $charge,
        ) ;
    } ;

    if ( $mode eq 'XML' ) {
        foreach my $id ( sort keys %{$metabolites} ) {
            my $mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
            if ($mass) {
                ( $metabolites->{$id}{'[M+H]+'}, $metabolites->{$id}{'[M-H]-'} ) = $modes_of->($mass) ;
            }
            else {
                warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
            }
        }
    }
    elsif ( $mode eq 'CARD' ) {
        foreach my $entry ( @{$metabolites} ) {
            my $mass = $entry->{'MZ_MONO'} ;
            if ($mass) {
                ( $entry->{'MZ_[M+H]+'}, $entry->{'MZ_[M-H]-'} ) = $modes_of->($mass) ;
            }
            else {
                warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'}\n " ;
            }
        }
    }
    else {
        ## previously ignored silently: tell the caller nothing was computed
        warn "Unknown format '$mode' (expected 'XML' or 'CARD'): no mass computed\n" ;
    }

    return ($metabolites) ;
}
237 ### END of SUB
238
239 =head2 METHOD buildMetabolitesArray
240
241 ## Description : build a metabolite list from xml extraction
242 ## Input : $metabolites, $headers
243 ## Output : $metabolitesSorted
244 ## Usage : my ( $metabolitesSorted ) = $object->buildMetabolitesArray ( $metabolites, $headers ) ;
245
246 =cut
247 ## START of SUB
sub buildMetabolitesArray {
    ## Flatten the metabolite hash into a table (array of rows).
    ## Input  : $metabolites - hash ref keyed by metabolite id
    ##          $headers     - optional array ref used as first row
    ## Output : array ref of rows: the header row, then one row per id
    ##          (ids sorted as strings)
    my ( $self, $metabolites, $headers ) = @_ ;

    ## fields copied after the id, in column order
    my @columns = (
        'monisotopic_molecular_weight',
        '[M+H]+',
        '[M-H]-',
        'metabolite_name',
        'chemical_formula',
        'inchikey',
    ) ;

    ## caller supplied header wins, otherwise use the built-in one
    my $first_row = ( defined $headers )
        ? $headers
        : [ 'HMDB_ID', 'MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey' ] ;

    my @rows = ($first_row) ;

    for my $id ( sort keys %{$metabolites} ) {
        push( @rows, [ $id, map { $metabolites->{$id}{$_} } @columns ] ) ;
    }

    return ( \@rows ) ;
}
278 ### END of SUB
279
280 =head2 METHOD cowmetdb_hash_to_inhouse_format
281
282 ## Description : adapter converting cowmetdb entry hashes to the in-house tabular format
283 ## Input : $entries
284 ## Output : $tsv_handler
285 ## Usage : my ( $tsv_handler ) = $object->cowmetdb_hash_to_inhouse_format( $entries ) ;
286
287 =cut
288 ## START of SUB
sub cowmetdb_hash_to_inhouse_format {
    ## Convert a list of entry hashes into a table (array of rows).
    ## Input  : $entries - array ref of hash refs
    ## Output : array ref of rows: the column names first, then one row
    ##          per entry with the values in the same column order
    my ( $self, $entries ) = @_ ;

    ## column order of the produced table
    my @columns = (
        'HMDB_ID',   'COMMON_NAME', 'CAS',       'FORMULA',
        'MZ_MONO',   'MZ_AVERAGE',  'MZ_[M+H]+', 'MZ_[M-H]-',
        'KEGG_ID',   'BIOCYC_ID',   'INCHI',     'LOCATION',
        'TAXONOMY',  'IUPAC',
    ) ;

    ## first line: the column names
    my @table = ( \@columns ) ;

    ## then one line per entry; absent fields stay undefined
    for my $card ( @{$entries} ) {
        push( @table, [ map { $card->{$_} } @columns ] ) ;
    }

    return ( \@table ) ;
}
306 ## END of SUB
307
308
309 1 ;
310
311
312 __END__
313
314 =head1 SUPPORT
315
316 You can find documentation for this module with the perldoc command.
317
318 perldoc hmdb_api.pm
319
320 =head1 Exports
321
322 =over 4
323
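324 =item :ALL is getMetaboliteFeatures, cowmetdb_handle, cowmetdb_hash, cowmetdb_hash_to_inhouse_format, buildMetabolitesArray and setMetaboliteAcurrateMzToModesMz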
325
326 =back
327
328 =head1 AUTHOR
329
330 Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt>
331
332 =head1 LICENSE
333
334 This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
335
336 =head1 VERSION
337
338 version 1 : xx / xx / 201x
339
340 version 2 : ??
341
342 =cut