Mercurial > repos > fgiacomoni > downloader_bank_hmdb
comparison lib/hmdb_api.pm @ 0:7c9269bded0e draft
Init repository for [downloader_bank_hmdb]
author | fgiacomoni |
---|---|
date | Tue, 14 Jan 2020 05:21:23 -0500 |
parents | |
children | be504ccbc41c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7c9269bded0e |
---|---|
1 package hmdb_api ; | |
2 | |
3 use strict; | |
4 use warnings ; | |
5 use Exporter ; | |
6 use Carp ; | |
7 | |
8 use Data::Dumper ; | |
9 use XML::Twig ; | |
10 | |
11 use csv ; | |
12 | |
## Public API of the module. The list is written once and reused for both the
## default export list and the :ALL tag so the two can never drift apart.
## (The package variables are declared with "our" only; the obsolete
## "use vars" pragma was redundant with those declarations.)
my @exported_subs = qw( getMetaboliteFeatures cowmetdb_handle cowmetdb_hash cowmetdb_hash_to_inhouse_format buildMetabolitesArray setMetaboliteAcurrateMzToModesMz ) ;

our $VERSION = "1.0";
our @ISA = qw(Exporter);
our @EXPORT = @exported_subs ;
our %EXPORT_TAGS = ( ALL => [ @exported_subs ] ) ;
19 | |
20 =head1 NAME | |
21 | |
22 hmdb_api - Parse HMDB MetaboCard flat files and XML exports and build metabolite tables | |
23 | |
24 =head1 SYNOPSIS | |
25 | |
26 use hmdb_api; | |
27 my $api = hmdb_api->new(); | |
28 my ( $metabolites, $count ) = $api->getMetaboliteFeatures( $xmlFile ); | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 This module reads HMDB data, either from a | |
33 MetaboCard flat file or from an XML export, and | |
34 builds metabolite tables with [M+H]+ and [M-H]- masses. | |
35 | |
36 =head1 METHODS | |
37 | |
38 Methods are : | |
39 | |
40 =head2 METHOD new | |
41 | |
42 ## Description : new | |
43 ## Input : $self | |
44 ## Output : bless $self ; | |
45 ## Usage : new() ; | |
46 | |
47 =cut | |
48 | |
sub new {
    ## Constructor: build an empty hash-based object.
    ## Input  : the invocant - a class name or an existing object (optional)
    ## Output : a new blessed hash reference
    my ( $invocant ) = @_ ;

    ## Bless into the caller's class so that subclasses get objects of their
    ## own class; fall back to this package when new() is called as a plain
    ## function without any invocant.
    my $class = ref($invocant) || $invocant || __PACKAGE__ ;

    my $self = {} ;
    bless( $self, $class ) ;
    return $self ;
}
55 ### END of SUB | |
56 | |
57 | |
58 =head2 METHOD cowmetdb_handle | |
59 | |
60 ## Description : open a flat file, load its content into memory and count the entries it contains. | |
61 ## Input : $flat | |
62 ## Output : $handler, $entries | |
63 ## Usage : my ( $handler, $entries ) = $api->cowmetdb_handle( $flat ) ; | |
64 | |
65 =cut | |
66 ## START of SUB | |
sub cowmetdb_handle {
    ## Load a flat file into memory (one array element per chomped line) and
    ## count the complete #BEGIN_METABOCARD ... #END_METABOCARD records.
    ## Input  : $flat - path of the flat file
    ## Output : ( \@handle, \$entries ) - array ref of the lines and a
    ##          reference to the number of complete records
    ## Dies   : croaks when no path is given, when the file does not exist
    ##          or when it cannot be opened
    my $self = shift ;
    my ( $flat ) = @_ ;

    my @handle  = () ;
    my $entries = 0 ;
    my $in_card = 0 ; ## true between a #BEGIN_METABOCARD and its #END_METABOCARD

    if ( !defined $flat ) {
        croak "Can't find the source file (no path given)\n" ;
    }
    elsif ( !-e $flat ) {
        croak "Can't find the source file $flat\n" ;
    }

    ## Three-argument open with a lexical handle: the file name can no longer
    ## be interpreted as an open mode, and the handle is not a package global.
    open( my $fh, '<', $flat ) or croak "Can't read the file $flat: $!\n" ;
    while ( my $line = <$fh> ) {
        chomp $line ;
        push( @handle, $line ) ;

        if ( $line =~ /^#BEGIN_METABOCARD/ ) {
            $in_card = 1 ;
        }
        elsif ( $in_card and ( $line =~ /^#END_METABOCARD/ ) ) {
            ## an END only counts when it closes a previously opened card
            $entries++ ;
            $in_card = 0 ;
        }
    }
    close($fh) ;

    return( \@handle, \$entries ) ;
}
94 ## END of SUB | |
95 | |
96 =head2 METHOD cowmetdb_hash | |
97 | |
98 ## Description : work on a hmdb flat text handler and field data (selected fields), build a hash for each found entry | |
99 ## Input : $handler | |
100 ## Output : $entries | |
101 ## Usage : my ( $entries ) = $api->cowmetdb_hash( $handler ) ; | |
102 | |
103 =cut | |
104 ## START of SUB | |
sub cowmetdb_hash {
    ## Turn the lines of a MetaboCard flat file into one hash per record.
    ## A field is written as a "# tag:" line followed by its value on the
    ## next line; only the tags listed in %key_for are kept.
    ## Input  : $handle - array ref of chomped lines
    ## Output : array ref of hash refs, one per #BEGIN/#END_METABOCARD pair
    ## Dies   : croaks when $handle is undefined
    my $self = shift ;
    my ( $handle ) = @_ ;

    ## flat-file tag => key used in the entry hash
    my %key_for = (
        'name'                 => 'COMMON_NAME',
        'iupac'                => 'IUPAC',
        'kegg_compound_id'     => 'KEGG_ID',
        'chemical_formula'     => 'FORMULA',
        'taxonomy_super_class' => 'TAXONOMY',
        'cas_number'           => 'CAS',
        'biofluid_location'    => 'LOCATION',
        'inchi_identifier'     => 'INCHI',
        'weight_average'       => 'MZ_AVERAGE',
        'weight_mono'          => 'MZ_MONO',
        'biocyc_id'            => 'BIOCYC_ID',
        'hmdb_id'              => 'HMDB_ID',
    ) ;

    if ( !defined $handle ) {
        croak "Handle is not defined : parsing step impossible\n" ;
    }

    my @entries = () ;
    my %entry   = () ;
    my $in_card = 0 ; ## true between a #BEGIN_METABOCARD and its #END_METABOCARD

    foreach my $pos ( 0 .. $#{$handle} ) {
        my $data = $handle->[$pos] ;

        if ( $data =~ /^#BEGIN_METABOCARD/ ) {
            %entry   = () ;
            $in_card = 1 ;
        }
        elsif ( $data =~ /^#END_METABOCARD/ ) {
            ## Only an END that closes an opened card yields an entry, so the
            ## result stays in step with the count made by cowmetdb_handle
            ## and a stray END cannot duplicate the previous record.
            push( @entries, { %entry } ) if $in_card ;
            %entry   = () ;
            $in_card = 0 ;
        }
        elsif ( ( $data =~ /^# (\w+):/ ) and ( exists $key_for{$1} ) ) {
            ## the value of a field is the line following its tag
            $entry{ $key_for{$1} } = $handle->[ $pos + 1 ] ;
        }
    }

    return(\@entries) ;
}
141 ## END of SUB | |
142 | |
143 | |
144 | |
145 =head2 METHOD getMetaboliteFeatures | |
146 | |
147 ## Description : get metabolites features from a xml file | |
148 ## Input : $xmlFile, | |
149 ## Output : $metabolites, $entriesNumber | |
150 ## Usage : my ( $metabolites, $entriesNumber ) = $api->getMetaboliteFeatures($xmlFile) ; | |
151 | |
152 =cut | |
sub getMetaboliteFeatures {
    ## Extract a few features of every <metabolite> of an XML file with
    ## XML::Twig: name, chemical formula, monoisotopic weight and InChIKey,
    ## all stored under the metabolite accession.
    ## Input  : $xmlFile - path of the XML file
    ## Output : ( \%metabolites, $count ) - hash ref keyed by accession and
    ##          the number of accessions found; ( {}, 0 ) with a warning when
    ##          the file is missing
    my $self = shift ;
    my ( $xmlFile ) = @_ ;

    my %metabolites = () ;
    my $id = undef ; ## accession of the <metabolite> currently being parsed

    ## Record the text of one child element under the current accession.
    ## Nothing is stored while no accession is known, so a field can never
    ## be attached to the wrong metabolite.
    my $store = sub {
        my ( $field, $elt ) = @_ ;
        return unless defined $id ;
        $metabolites{$id}{$field} = $elt->text_only ;
        return ;
    } ;

    if ( ( defined $xmlFile ) and ( -e $xmlFile ) ) {

        my $twig = XML::Twig->nparse_ppe(

            twig_handlers => {
                'metabolite/accession' => sub { my ( $t, $elt ) = @_ ; $id = $elt->text_only ; $metabolites{$id} = undef ; } ,
                # metabolite name
                'metabolite/name' => sub { $store->( 'metabolite_name', $_[1] ) ; } ,
                # metabolite chemical_formula
                'metabolite/chemical_formula' => sub { $store->( 'chemical_formula', $_[1] ) ; } ,
                # metabolite monisotopic_molecular_weight (both spellings are stored under the same key)
                'metabolite/monisotopic_molecular_weight' => sub { $store->( 'monisotopic_molecular_weight', $_[1] ) ; } , ## general case
                'metabolite/monisotopic_moleculate_weight' => sub { $store->( 'monisotopic_molecular_weight', $_[1] ) ; } , ## alternative spelling
                # metabolite inchikey
                'metabolite/inchikey' => sub { $store->( 'inchikey', $_[1] ) ; } ,
                # end of a metabolite: forget its accession and release the
                # parsed elements so memory use does not grow with file size
                'metabolite' => sub { my ( $t ) = @_ ; $id = undef ; $t->purge ; } ,
            },
            pretty_print => 'indented',
            error_context => 1, $xmlFile
        );

        $twig->purge ;
    }
    else {
        ## still return an empty result, but say why
        my $shown = defined $xmlFile ? $xmlFile : '(no path given)' ;
        carp "Can't find the XML file $shown : no metabolite extracted\n" ;
    }

    ## get number of entries:
    my $count = scalar keys %metabolites ;

    return (\%metabolites, $count) ;
}
193 ### END of SUB | |
194 | |
195 =head2 METHOD setMetaboliteAcurrateMzToModesMz | |
196 | |
197 ## Description : set M+H and M-H masses from a metabolite (M) accurate mass | |
198 ## Input : $format ('XML' or 'CARD'), $metabolites, $proton_mass, $electron_mass, $charge | |
199 ## Output : $mzsMetabolites | |
200 ## Usage : my ( $mzsMetabolites ) = $api->setMetaboliteAcurrateMzToModesMz ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) ; | |
201 | |
202 =cut | |
203 ## START of SUB | |
sub setMetaboliteAcurrateMzToModesMz {
    ## Add the [M+H]+ and [M-H]- masses to every metabolite that has a
    ## monoisotopic mass. The structures are modified in place.
    ## Input  : $format        - 'XML'  : $metabolites is a hash ref keyed by id,
    ##                                    results go to '[M+H]+' / '[M-H]-'
    ##                           'CARD' : $metabolites is an array ref of hash
    ##                                    refs, results go to 'MZ_[M+H]+' / 'MZ_[M-H]-'
    ##          $metabolites   - the structure to complete
    ##          $proton_mass, $electron_mass - masses added / subtracted
    ##          $charge        - multiplier applied to both results (default 1)
    ## Output : $metabolites (the same reference)
    ## Warns  : for every metabolite without mass and for an unknown format
    my $self = shift ;
    my ( $format, $metabolites, $proton_mass, $electron_mass, $charge ) = @_;

    ## An omitted charge used to multiply every mass by undef (result 0 plus
    ## warnings); a factor of 1 leaves the computed mass untouched.
    $charge = 1 unless defined $charge ;
    $format = '' unless defined $format ;

    if ($format eq 'XML') {
        foreach my $id (sort keys %{$metabolites}) {
            if ( $metabolites->{$id}{'monisotopic_molecular_weight'} ) {
                my $tmp_mass = $metabolites->{$id}{'monisotopic_molecular_weight'} ;
                $metabolites->{$id}{'[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ;
                $metabolites->{$id}{'[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ;
            }
            else {
                warn "No monisotopic_molecular_weight field exists with metabolite $id\n " ;
            }
        }
    }
    elsif ($format eq 'CARD') {
        foreach my $entry (@$metabolites) {
            if ( $entry->{'MZ_MONO'} ) {
                my $tmp_mass = $entry->{'MZ_MONO'} ;
                $entry->{'MZ_[M+H]+'} = ( $tmp_mass + $proton_mass - $electron_mass) * $charge ;
                $entry->{'MZ_[M-H]-'} = ( $tmp_mass - $proton_mass + $electron_mass) * $charge ;
            }
            else {
                warn "No MZ_MONO field exists with metabolite $entry->{'HMDB_ID'}\n " ;
            }
        }
    }
    else {
        ## nothing is computed for any other format: report it instead of
        ## returning the untouched structure silently
        carp "Unknown format '$format' (expected 'XML' or 'CARD') : no mass computed\n" ;
    }

    return ($metabolites) ;
}
237 ### END of SUB | |
238 | |
239 =head2 METHOD buildMetabolitesArray | |
240 | |
241 ## Description : build a metabolite list from xml extraction | |
242 ## Input : $metabolites, $headers | |
243 ## Output : $metabolitesSorted | |
244 ## Usage : my ( $metabolitesSorted ) = buildMetabolitesArray ( $metabolites, $headers ) ; | |
245 | |
246 =cut | |
247 ## START of SUB | |
sub buildMetabolitesArray {
    ## Flatten the metabolites hash into a table (array of array refs).
    ## Input  : $metabolites - hash ref keyed by metabolite id
    ##          $headers     - optional array ref used as first row
    ## Output : array ref; first row is the header, then one row per id
    ##          (ids in string sort order): id, monoisotopic weight,
    ##          [M+H]+, [M-H]-, name, chemical formula, InChIKey
    my $self = shift ;
    my ( $metabolites, $headers ) = @_;

    ## keys read from each metabolite, in column order (after the id)
    my @columns = ( 'monisotopic_molecular_weight', '[M+H]+', '[M-H]-', 'metabolite_name', 'chemical_formula', 'inchikey' ) ;

    ## first row: the caller's headers when given, the default ones otherwise
    my $first_row = defined $headers
        ? $headers
        : ['HMDB_ID','MzBank', '[M+H]+', '[M-H]-', 'MetName', 'ChemFormula', 'INChIkey'] ;

    my @table = ( $first_row ) ;

    for my $id ( sort keys %{$metabolites} ) {
        my @row = ( $id, map { $metabolites->{$id}{$_} } @columns ) ;
        push @table, \@row ;
    }

    return \@table ;
}
278 ### END of SUB | |
279 | |
280 =head2 METHOD cowmetdb_hash_to_inhouse_format | |
281 | |
282 ## Description : adapter from a cowmetdb entry hash to the in-house format | |
283 ## Input : $entries | |
284 ## Output : $tsv_handler | |
285 ## Usage : my ( $tsv_handler ) = cowmetdb_hash_to_inhouse_format( $entries ) ; | |
286 | |
287 =cut | |
288 ## START of SUB | |
sub cowmetdb_hash_to_inhouse_format {
    ## Convert a list of entry hashes into a table (array of array refs).
    ## Input  : $entries - array ref of hash refs
    ## Output : array ref; first row holds the field names, then one row
    ##          per entry with the values in the same order (undef when
    ##          an entry has no such field)
    my $self = shift ;
    my ( $entries ) = @_ ;

    ## column order of the output; also the keys read from each entry
    my @fields_name = ('HMDB_ID', 'COMMON_NAME', 'CAS', 'FORMULA', 'MZ_MONO', 'MZ_AVERAGE', 'MZ_[M+H]+', 'MZ_[M-H]-', 'KEGG_ID', 'BIOCYC_ID', 'INCHI', 'LOCATION', 'TAXONOMY', 'IUPAC') ;

    my @tsv_handler = ( \@fields_name ) ; ## header row first

    for my $entry ( @{$entries} ) {
        my @values = map { $entry->{$_} } @fields_name ;
        push @tsv_handler, \@values ; ## one row per entry
    }

    return \@tsv_handler ;
}
306 ## END of SUB | |
307 | |
308 | |
309 1 ; | |
310 | |
311 | |
312 __END__ | |
313 | |
314 =head1 SUPPORT | |
315 | |
316 You can find documentation for this module with the perldoc command. | |
317 | |
318 perldoc hmdb_api.pm | |
319 | |
320 =head1 Exports | |
321 | |
322 =over 4 | |
323 | |
324 =item :ALL is ... | |
325 | |
326 =back | |
327 | |
328 =head1 AUTHOR | |
329 | |
330 Franck Giacomoni E<lt>franck.giacomoni@clermont.inra.frE<gt> | |
331 | |
332 =head1 LICENSE | |
333 | |
334 This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. | |
335 | |
336 =head1 VERSION | |
337 | |
338 version 1 : xx / xx / 201x | |
339 | |
340 version 2 : ?? | |
341 | |
342 =cut |