Mercurial > repos > galaxyp > mqppep_preproc
comparison PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8dfd5d2b5903 |
---|---|
1 #!/usr/local/bin/perl | |
2 ############################################################################################################################### | |
3 # perl Kinase_enrichment_analysis_complete_v0.pl | |
4 # | |
5 # Nick Graham, USC | |
6 # 2016-02-27 | |
7 # | |
8 # Built from scripts written by NG at UCLA in Tom Graeber's lab: | |
9 # CombinePhosphoSites.pl | |
10 # Retrieve_p_motifs.pl | |
11 # NetworKIN_Motif_Finder_v7.pl | |
12 # | |
13 # Given a list of phospho-peptides, find protein information and upstream kinases. | |
14 # Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl | |
15 # | |
16 # Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab: | |
17 # Added warnings and used strict; | |
18 # fixed some code paths resulting in more NetworKIN matches; | |
19 # applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow) | |
20 # to speed up "Match the non_p_peptides to the @sequences array"; | |
21 # added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data; | |
22 # added support for SQLite output in addition to tabular files. | |
23 # | |
24 # | |
25 ############################################################################################################################### | |
26 | |
27 use strict; | |
28 use warnings 'FATAL' => 'all'; | |
29 | |
30 use Getopt::Std; | |
31 use DBD::SQLite::Constants qw/:file_open/; | |
32 use DBI qw(:sql_types); | |
33 use File::Copy; | |
34 use File::Basename; | |
35 use POSIX qw(strftime); | |
36 use Time::HiRes qw(gettimeofday); | |
37 #use Data::Dump qw(dump); | |
38 | |
39 my $USE_SEARCH_PPEP_PY = 1; | |
40 #my $FAILED_MATCH_SEQ = "Failed match"; | |
41 my $FAILED_MATCH_SEQ = 'No Sequence'; | |
42 my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name'; | |
43 | |
44 my $dirname = dirname(__FILE__); | |
45 my %opts; | |
46 my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type); | |
47 my $dbtype; | |
48 my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in); | |
49 my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n); | |
50 my $line = 0; | |
51 my @failed_match = ($FAILED_MATCH_SEQ); | |
52 my @failed_matches; | |
53 my (%all_data); | |
54 my (@p_peptides, @non_p_peptides); | |
55 my @parsed_fasta; | |
56 my (@accessions, @names, @sequences, @databases, $database); | |
57 my ($dbfile, $dbh, $stmth); | |
58 my @col_names; | |
59 my (%matched_sequences, %accessions, %names, %sites, ); | |
60 my (@tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites); | |
61 my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues); | |
62 my (@kinases_observed, $kinases); | |
63 my (@kinases_observed_lbl, @phosphosites_observed_lbl); | |
64 my ($p_sequence_kinase, $p_sequence, $kinase); | |
65 my (@motif_sequence, @motif_description, @motif_type_key_ary, %motif_type, %motif_count); | |
66 my (@kinases_PhosphoSite, $kinases_PhosphoSite); | |
67 my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); | |
68 my (%regulatory_sites_PhosphoSite_hash); | |
69 my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism); | |
70 my (%unique_motifs); | |
71 my ($kinase_substrate_NetworKIN_matches, $kinase_substrate_PhosphoSite_matches); | |
72 my %psp_regsite_protein_2; | |
73 my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2); | |
74 my @timeData; | |
75 my $PhosphoSitePlusCitation; | |
76 my (%site_description, %site_id); | |
77 | |
78 my %kinase_substrate_NetworKIN_matches; | |
79 my %kinase_motif_matches; | |
80 my $regulatory_sites_PhosphoSite; | |
81 my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2); | |
82 my %kinase_substrate_PhosphoSite_matches; | |
83 my @formatted_sequence; | |
84 my $pSTY_sequence; | |
85 my $i; | |
86 my @a; | |
87 my $use_sqlite; | |
88 my $verbose; | |
89 | |
90 ########## | |
91 ## opts ## | |
92 ########## | |
93 ## input files | |
94 # i : path to input file, e.g., 'outputfile_STEP2.txt' | |
95 # f : path to UniProtKB/SwissProt FASTA | |
96 # s : optional species argument | |
97 # n : path to NetworKIN_201612_cutoffscore2.0.txt | |
98 # m : path to pSTY_Motifs.txt | |
99 # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt | |
100 # r : path to 2017-03_PSP_Regulatory_sites.txt | |
101 ## options | |
102 # P : phospho_type | |
103 # F : function | |
104 # v : verbose output | |
105 ## output files | |
106 # o : path to output file | |
107 # O : path to "melted" output file | |
108 # D : path to output SQLite file | |
109 | |
110 sub usage() | |
111 { | |
112 print STDERR <<"EOH"; | |
113 This program given a list of phospho-peptides, finds protein information and upstream kinases. | |
114 usage: $0 [-hvd] -f FASTA_file | |
115 -h : this (help) message | |
116 -v : slightly verbose | |
117 -a : use SQLite less | |
118 ## input files | |
119 -i : path to input file, e.g., 'outputfile_STEP2.txt' | |
120 -f : path to UniProtDB/SwissProt FASTA | |
121 -s : optional species filter argument for PSP records; defaults to 'human' | |
122 -n : path to NetworKIN_201612_cutoffscore2.0.txt | |
123 -m : path to pSTY_Motifs.txt | |
124 -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt | |
125 -r : path to 2017-03_PSP_Regulatory_sites.txt | |
126 ## options | |
127 -P : phospho_type | |
128 -F : function | |
129 ## output files | |
130 -o : path to output file | |
131 -O : path to "melted" output file | |
132 -D : path to output SQLite file | |
133 example: $0 | |
134 EOH | |
135 exit; | |
136 } | |
137 | |
138 sub format_localtime_iso8601 { | |
139 # ref: https://perldoc.perl.org/Time::HiRes | |
140 my ($seconds, $microseconds) = gettimeofday; | |
141 # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html | |
142 return strftime("%Y-%m-%dT%H:%M:%S",localtime(time)) . sprintf(".%03d", $microseconds/1000); | |
143 } | |
144 | |
145 sub replace_pSpTpY { | |
146 my ($formatted_sequence, $phospho_type) = @_; | |
147 if ($phospho_type eq 'y') { | |
148 $formatted_sequence =~ s/pS/S/g; | |
149 $formatted_sequence =~ s/pT/T/g; | |
150 $formatted_sequence =~ s/pY/y/g; | |
151 } | |
152 elsif ($phospho_type eq "sty") { | |
153 $formatted_sequence =~ s/pS/s/g; | |
154 $formatted_sequence =~ s/pT/t/g; | |
155 $formatted_sequence =~ s/pY/y/g; | |
156 } | |
157 $formatted_sequence; | |
158 } | |
159 | |
160 sub pseudo_sed | |
161 { | |
162 # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV" | |
163 # Comments give the sed equivalent | |
164 my ($t) = @_; | |
165 my $s = $t; | |
166 # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; }; | |
167 unless ($s =~ m / GN=/s) | |
168 { | |
169 $s =~ s :(OX=[^ \t]*):${1} GN=N/A:s; | |
170 } | |
171 # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; }; | |
172 unless ($s =~ m / PE=/s) | |
173 { | |
174 $s =~ s :(GN=[^ \t]*):${1} PE=N/A:s; | |
175 } | |
176 # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; }; | |
177 unless ($s =~ m / SV=/s) | |
178 { | |
179 $s =~ s :(PE=[^ \t]*):${1} SV=N/A:s; | |
180 } | |
181 # s/^sp.//; | |
182 $s =~ s :^...::s; | |
183 # s/[|]/\t/g; | |
184 $s =~ s :[|]:\t:sg; | |
185 if ( !($s =~ m/ OX=/s) | |
186 && !($s =~ m/ GN=/s) | |
187 && !($s =~ m/ PE=/s) | |
188 && !($s =~ m/ SV=/s) | |
189 ) { | |
190 # OS= is used elsewhere, but it's not helpful without OX and GN | |
191 $s =~ s/OS=/Species /g; | |
192 # supply sensible default values | |
193 $s .= "\tN/A\t-1\tN/A\tN/A\tN/A"; | |
194 } else { | |
195 # s/ OS=/\t/; | |
196 if ($s =~ m/ OS=/s) { $s =~ s: OS=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; | |
197 # s/ OX=/\t/; | |
198 if ($s =~ m/ OX=/s) { $s =~ s: OX=:\t:s; } else { $s =~ s:(.*)\t:$1\t-1\t:x; }; | |
199 # s/ GN=/\t/; | |
200 if ($s =~ m/ GN=/s) { $s =~ s: GN=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; | |
201 # s/ PE=/\t/; | |
202 if ($s =~ m/ PE=/s) { $s =~ s: PE=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; | |
203 # s/ SV=/\t/; | |
204 if ($s =~ m/ SV=/s) { $s =~ s: SV=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; | |
205 } | |
206 return $s; | |
207 } # sub pseudo_sed | |
208 | |
209 getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ; | |
210 | |
211 | |
212 if (exists($opts{'h'})) { | |
213 usage(); | |
214 } | |
215 if (exists($opts{'a'})) { | |
216 $USE_SEARCH_PPEP_PY = 0; | |
217 } | |
218 if (exists($opts{'v'})) { | |
219 $verbose = 1; | |
220 } else { | |
221 $verbose = 0; | |
222 } | |
223 if (!exists($opts{'i'}) || !-e $opts{'i'}) { | |
224 die('Input File not found'); | |
225 } else { | |
226 $file_in = $opts{'i'}; | |
227 } | |
228 if (!exists($opts{'f'}) || !-e $opts{'f'}) { | |
229 die('FASTA not found'); | |
230 } else { | |
231 $fasta_in = $opts{'f'}; | |
232 $use_sqlite = 0; | |
233 } | |
234 my $species; | |
235 if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) { | |
236 $species = 'human'; | |
237 } else { | |
238 $species = $opts{'s'}; | |
239 print "'-s' option is '$species'\n"; | |
240 } | |
241 print "species filter is '$species'\n"; | |
242 | |
243 if (!exists($opts{'n'}) || !-e $opts{'n'}) { | |
244 die('Input NetworKIN File not found'); | |
245 } else { | |
246 $networkin_in = $opts{'n'}; | |
247 } | |
248 if (!exists($opts{'m'}) || !-e $opts{'m'}) { | |
249 die('Input pSTY_Motifs File not found'); | |
250 } else { | |
251 $motifs_in = $opts{'m'}; | |
252 } | |
253 if (!exists($opts{'p'}) || !-e $opts{'p'}) { | |
254 die('Input PSP_Kinase_Substrate_Dataset File not found'); | |
255 } else { | |
256 $PSP_Kinase_Substrate_in = $opts{'p'}; | |
257 } | |
258 if (!exists($opts{'r'}) || !-e $opts{'r'}) { | |
259 die('Input PSP_Regulatory_sites File not found'); | |
260 } else { | |
261 $PSP_Regulatory_Sites_in = $opts{'r'}; | |
262 } | |
263 if (exists($opts{'P'})) { | |
264 $phospho_type = $opts{'P'}; | |
265 } | |
266 else { | |
267 $phospho_type = "sty"; | |
268 } | |
269 if (exists($opts{'F'})) { | |
270 $average_or_sum = $opts{'F'}; | |
271 } | |
272 else { | |
273 $average_or_sum = "sum"; | |
274 } | |
275 if (exists($opts{'D'})) { | |
276 $db_out = $opts{'D'}; | |
277 } | |
278 else { | |
279 $db_out = "db_out.sqlite"; | |
280 } | |
281 if (exists($opts{'O'})) { | |
282 $file_melt = $opts{'O'}; | |
283 } | |
284 else { | |
285 $file_melt = "output_melt.tsv"; | |
286 } | |
287 if (exists($opts{'o'})) { | |
288 $file_out = $opts{'o'}; | |
289 } | |
290 else { | |
291 $file_out = "output.tsv"; | |
292 } | |
293 | |
294 | |
295 ############################################################################################################################### | |
296 # Print the relevant file names to the screen | |
297 ############################################################################################################################### | |
298 # print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n"; | |
299 print "\n--- parameters:\n"; | |
300 print "Data file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\nMelted map: $file_melt\n"; | |
301 if ($use_sqlite == 0) { | |
302 print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file: $fasta_in\nOutput SQLite file: $db_out\n"; | |
303 } else { | |
304 print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file: $dbfile\nOutput SQLite file: $db_out\n"; | |
305 } | |
306 print "...\n\n"; | |
307 | |
308 print "Phospho-residues(s) = $phospho_type\n\n"; | |
309 if ($phospho_type ne 'y') { | |
310 if ($phospho_type ne 'sty') { | |
311 die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n"; | |
312 } | |
313 } | |
314 | |
315 ############################################################################################################################### | |
316 # read the input data file | |
317 # average or sum identical phospho-sites, depending on the value of $average_or_sum | |
318 ############################################################################################################################### | |
319 | |
320 open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n"; | |
321 | |
322 die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ; | |
323 | |
324 | |
325 $line = 0; | |
326 | |
327 while (<IN>) { | |
328 chomp; | |
329 my @x = split(/\t/); | |
330 for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;} | |
331 | |
332 # Read in the samples | |
333 if ($line == 0) { | |
334 for my $n (1 .. $#x) { | |
335 push (@samples, $x[$n]); | |
336 $sample_id_lut{$x[$n]} = $n; | |
337 } | |
338 $line++; | |
339 } else { | |
340 # check whether we have already seen a phospho-peptide | |
341 if (exists($data{$x[0]})) { | |
342 if ($average_or_sum eq "sum") { # add the data | |
343 # unload the data | |
344 @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); } | |
345 # add the new data and repack | |
346 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; } | |
347 $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); } | |
348 | |
349 } elsif ($average_or_sum eq "average") { # average the data | |
350 # unload the data | |
351 @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); } | |
352 # average with the new data and repack | |
353 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); } | |
354 $n{$x[0]}++; | |
355 $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); } | |
356 } | |
357 } | |
358 # if the phospho-sequence has not been seen, save the data | |
359 else { | |
360 for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); } | |
361 $n{$x[0]} = 1; | |
362 } | |
363 } | |
364 } | |
365 close(IN); | |
366 | |
367 | |
368 ############################################################################################################################### | |
369 # Search the FASTA database for phospho-sites and motifs | |
370 # | |
371 # based on Retrieve_p_peptide_motifs_v2.pl | |
372 ############################################################################################################################### | |
373 | |
374 | |
375 ############################################################################################################################### | |
376 # | |
377 # Read in the Data file: | |
378 # 1) make @p_peptides array as in the original file | |
379 # 2) make @non_p_peptides array w/o residue modifications (p, #, other) | |
380 # | |
381 ############################################################################################################################### | |
382 | |
383 foreach my $peptide (keys %data) { | |
384 $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g; | |
385 push (@p_peptides, $peptide); | |
386 $peptide =~ s/p//g; | |
387 push(@non_p_peptides, $peptide); | |
388 } | |
389 | |
390 if ($use_sqlite == 0) { | |
391 ############################################################################################################################### | |
392 # | |
393 # Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database | |
394 # | |
395 ############################################################################################################################### | |
396 | |
397 # e.g. | |
398 # >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 | |
399 # MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD | |
400 # DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK | |
401 # EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH | |
402 # QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS | |
403 # EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT | |
404 # accession: Q9Y3B9 | |
405 # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 | |
406 # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT | |
407 # | |
408 # e.g. | |
409 # >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP | |
410 # >gi|52001466|sp|P00366.2|DHE3_BOVIN Glutamate dehydrogenase 1, mitochondrial precursor (GDH) cRAP | |
411 # | |
412 # e.g. | |
413 # >zs|P00009.24.AR-V2_1.zs|zs_peptide_0024_AR-V2_1 | |
414 | |
415 | |
416 open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n"; | |
417 print "Reading FASTA file $fasta_in\n"; | |
418 # ref: https://perldoc.perl.org/perlsyn#Compound-Statements | |
419 # "If the condition expression of a while statement is based on any of | |
420 # a group of iterative expression types then it gets some magic treatment. | |
421 # The affected iterative expression types are readline, the <FILEHANDLE> | |
422 # input operator, readdir, glob, the <PATTERN> globbing operator, and | |
423 # `each`. If the condition expression is one of these expression types, | |
424 # then the value yielded by the iterative operator will be implicitly | |
425 # assigned to `$_`." | |
426 while (<IN1>) { | |
427 chomp; | |
428 # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR | |
429 # "If only PATTERN is given, EXPR defaults to $_." | |
430 my (@x) = split(/\|/); | |
431 # begin FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP | |
432 if (@x > 3) { | |
433 @x = (">".$x[$#x - 2], $x[$#x - 1], $x[$#x]); | |
434 } | |
435 # end FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP | |
436 for my $i (0 .. $#x) { | |
437 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } | |
438 # Use of uninitialized value $x[0] in pattern match (m//) at /home/rstudio/src/mqppep/tools/mqppep/PhosphoPeptide_Upstream_Kinase_Mapping.pl line 411, <IN1> line 3. | |
439 if (exists($x[0])) { | |
440 if ($x[0] =~ /^>/) { | |
441 # parsing header line | |
442 $x[0] =~ s/\>//g; | |
443 push (@databases, $x[0]); | |
444 push (@accessions, $x[1]); | |
445 push (@names, $x[2]); | |
446 # format tags of standard UniProtKB headers as tab-separated values | |
447 # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV" | |
448 $_ = pseudo_sed(join "\t", (">".$x[0], $x[1], $x[2])); | |
449 # append tab as separator between header and sequence | |
450 s/$/\t/; | |
451 # parsed_fasta gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t" | |
452 print "push (\@parsed_fasta, $_)\n" if (0 && $x[0] ne "zs"); | |
453 push (@parsed_fasta, $_); | |
454 } elsif ($x[0] =~ /^\w/) { | |
455 # line is a portion of the sequence | |
456 if (defined $sequences[$#accessions]) { | |
457 $sequences[$#accessions] = $sequences[$#accessions].$x[0]; | |
458 } else { | |
459 $sequences[$#accessions] = $x[0]; | |
460 } | |
461 $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0]; | |
462 } | |
463 } | |
464 } | |
465 close IN1; | |
466 print "Done Reading FASTA file $fasta_in\n"; | |
467 $dbfile = $db_out; | |
468 print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n"; | |
469 $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef); | |
470 my $auto_commit = $dbh->{AutoCommit}; | |
471 print "auto_commit was $auto_commit and is now 0\n" if ($verbose); | |
472 $dbh->{AutoCommit} = 0; | |
473 | |
474 # begin DDL-to-SQLite | |
475 # --- | |
476 $stmth = $dbh->prepare(" | |
477 DROP TABLE IF EXISTS UniProtKB; | |
478 "); | |
479 $stmth->execute(); | |
480 | |
481 $stmth = $dbh->prepare(" | |
482 CREATE TABLE UniProtKB ( | |
483 Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE, | |
484 Description TEXT, | |
485 Organism_Name TEXT, | |
486 Organism_ID INTEGER, | |
487 Gene_Name TEXT, | |
488 PE TEXT, | |
489 SV TEXT, | |
490 Sequence TEXT, | |
491 Database TEXT | |
492 ) | |
493 "); | |
494 $stmth->execute(); | |
495 $stmth = $dbh->prepare(" | |
496 CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID); | |
497 "); | |
498 $stmth->execute(); | |
499 $stmth = $dbh->prepare(" | |
500 CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name); | |
501 "); | |
502 $stmth->execute(); | |
503 # ... | |
504 # end DDL-to-SQLite | |
505 | |
506 # insert all rows | |
507 # begin store-to-SQLite "UniProtKB" table | |
508 # --- | |
509 $stmth = $dbh->prepare(" | |
510 INSERT INTO UniProtKB ( | |
511 Uniprot_ID, | |
512 Description, | |
513 Organism_Name, | |
514 Organism_ID, | |
515 Gene_Name, | |
516 PE, | |
517 SV, | |
518 Sequence, | |
519 Database | |
520 ) VALUES (?,?,?,?,?,?,?,?,?) | |
521 "); | |
522 my $row_count = 1; | |
523 my $row_string; | |
524 my (@row, @rows); | |
525 my $wrd; | |
526 while ( scalar @parsed_fasta > 0 ) { | |
527 $database = $databases[$#parsed_fasta]; | |
528 # row_string gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t" | |
529 # 1 2 3 4 5 6 7 sequence database | |
530 $row_string = pop(@parsed_fasta); | |
531 @row = (split /\t/, $row_string); | |
532 if ((not exists($row[4])) || ($row[4] eq "")) { | |
533 die("invalid fasta line\n$row_string\n"); | |
534 }; | |
535 if ($row[4] eq "N/A") { | |
536 print "Organism_ID is 'N/A' for row $row_count:\n'$row_string'\n"; | |
537 $row[4] = -1; | |
538 }; | |
539 for $i (1..3,5..8) { | |
540 #BIND print "bind_param $i, $row[$i]\n"; | |
541 $stmth->bind_param($i, $row[$i]); | |
542 } | |
543 #BIND print "bind_param 4, $row[4]\n"; | |
544 $stmth->bind_param(9, $database); | |
545 #BIND print "bind_param 4, $row[4]\n"; | |
546 $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER }); | |
547 if (not $stmth->execute()) { | |
548 print "Error in row $row_count: " . $dbh->errstr . "\n"; | |
549 print "Row $row_count: $row_string\n"; | |
550 print "Row $row_count: " . ($row_string =~ s/\t/@/g) . "\n"; | |
551 } | |
552 if (0 && $database ne "zs") { | |
553 print "row_count: $row_count\n"; | |
554 #### print "row_string: $row_string\n"; | |
555 print "Row $row_count: $row_string\n"; | |
556 for $i (1..3,5..8) { | |
557 print "bind_param $i, $row[$i]\n" if (exists($row[$i])); | |
558 } | |
559 print "bind_param 4, $row[4]\n" if (exists($row[4])); | |
560 print "bind_param 9, $database\n"; | |
561 }; | |
562 $row_count += 1; | |
563 } | |
564 # ... | |
565 # end store-to-SQLite "UniProtKB" table | |
566 | |
567 print "begin commit at " . format_localtime_iso8601() . "\n"; | |
568 $dbh->{AutoCommit} = $auto_commit; | |
569 print "auto_commit is now $auto_commit\n" if ($verbose); | |
570 $dbh->disconnect if ( defined $dbh ); | |
571 print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n"; | |
572 $dbtype = "FASTA"; | |
573 } | |
574 | |
575 if ($use_sqlite == 1) { | |
576 ############################################################################################################################### | |
577 # | |
578 # Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array | |
579 # | |
580 ############################################################################################################################### | |
581 | |
582 copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!"; | |
583 | |
584 # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database | |
585 $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, { | |
586 sqlite_open_flags => SQLITE_OPEN_READONLY, | |
587 }); | |
588 print "DB connection $dbh is to $dbfile\n"; | |
589 | |
590 # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence | |
591 $stmth = $dbh->prepare(" | |
592 SELECT Uniprot_ID | |
593 , Description | |
594 || CASE WHEN Organism_Name = 'N/A' THEN '' ELSE ' OS=' || Organism_Name END | |
595 || CASE WHEN Organism_ID = -1 THEN '' ELSE ' OX=' || Organism_ID END | |
596 || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN=' || Gene_Name END | |
597 || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE=' || PE END | |
598 || CASE WHEN SV = 'N/A' THEN '' ELSE ' SV=' || SV END | |
599 AS Description | |
600 , Sequence | |
601 , Database | |
602 FROM | |
603 UniProtKB | |
604 "); | |
605 $stmth->execute(); | |
606 @col_names = @{$stmth->{NAME}}; | |
607 print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose); | |
608 while (my @row = $stmth->fetchrow_array) { | |
609 push (@names, $row[1]); # redacted Description | |
610 push (@accessions, $row[0]); # Uniprot_ID | |
611 $sequences[$#accessions] = $row[2]; # Sequence | |
612 push (@databases, $row[3]); # Database (should be 'sp') | |
613 } | |
614 | |
615 $dbh->disconnect if ( defined $dbh ); | |
616 | |
617 print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n"; | |
618 $dbtype = "SQLite"; | |
619 } | |
620 | |
621 print "$#accessions accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n"; | |
622 | |
623 ###################### | |
624 $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef); | |
625 $stmth = $dbh->prepare(" | |
626 INSERT INTO UniProtKB ( | |
627 Uniprot_ID, | |
628 Description, | |
629 Organism_Name, | |
630 Organism_ID, | |
631 Gene_Name, | |
632 PE, | |
633 SV, | |
634 Sequence, | |
635 Database | |
636 ) VALUES ( | |
637 'No Uniprot_ID', | |
638 'NO_GENE_SYMBOL No Description', | |
639 'No Organism_Name', | |
640 0, | |
641 '$FAILED_MATCH_GENE_NAME', | |
642 '0', | |
643 '0', | |
644 '$FAILED_MATCH_SEQ', | |
645 'No Database' | |
646 ) | |
647 "); | |
648 if (not $stmth->execute()) { | |
649 print "Error inserting dummy row into UniProtKB: $stmth->errstr\n"; | |
650 } | |
651 $dbh->disconnect if ( defined $dbh ); | |
652 ###################### | |
653 | |
654 @timeData = localtime(time); | |
655 print "\n--- Start search at " . format_localtime_iso8601() ."\n"; | |
656 | |
657 print " --> Calling 'search_ppep' script\n\n"; | |
658 if ($verbose) { | |
659 $i = system("python $dirname/search_ppep.py -u $db_out -p $file_in --verbose"); | |
660 } else { | |
661 $i = system("python $dirname/search_ppep.py -u $db_out -p $file_in"); | |
662 } | |
663 if ($i) { | |
664 print "python $dirname/search_ppep.py -u $db_out -p $file_in\n exited with exit code $i\n"; | |
665 die "Search failed for phosphopeptides in SwissProt/SQLite file."; | |
666 } | |
667 print " <-- Returned from 'search_ppep' script\n"; | |
668 | |
669 @timeData = localtime(time); | |
670 print "... Finished search at " . format_localtime_iso8601() ."\n\n"; | |
671 | |
672 | |
673 ############################################################################################################################### | |
674 # | |
675 # Match the non_p_peptides to the @sequences array: | |
676 # 1) Format the motifs +/- 10 residues around the phospho-site | |
677 # 2) Print the original data plus the phospho-motif to the output file | |
678 # | |
679 ############################################################################################################################### | |
680 | |
681 | |
682 print "--- Match the non_p_peptides to the \@sequences array:\n"; | |
683 | |
684 if ($USE_SEARCH_PPEP_PY) { | |
685 print "Find the matching protein sequence(s) for the peptide using SQLite\n"; | |
686 } else { | |
687 print "Find the matching protein sequence(s) for the peptide using slow search\n"; | |
688 } | |
689 | |
690 # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database | |
691 $dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, { | |
692 sqlite_open_flags => SQLITE_OPEN_READONLY, | |
693 }); | |
694 print "DB connection $dbh is to $db_out\n"; | |
695 | |
696 # CREATE VIEW uniprotid_pep_ppep AS | |
697 # SELECT deppep_UniProtKB.UniprotKB_ID AS accession | |
698 # , deppep.seq AS peptide | |
699 # , ppep.seq AS phosphopeptide | |
700 # , UniProtKB.Sequence AS sequence | |
701 # , UniProtKB.Description AS description | |
702 # FROM ppep, deppep, deppep_UniProtKB, UniProtKB | |
703 # WHERE deppep.id = ppep.deppep_id | |
704 # AND deppep.id = deppep_UniProtKB.deppep_id | |
705 # AND deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID | |
706 # ORDER BY UniprotKB_ID, deppep.seq, ppep.seq; | |
707 | |
708 my %ppep_to_count_lut; | |
709 print "start select peptide counts " . format_localtime_iso8601() . "\n"; | |
710 my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare(" | |
711 SELECT DISTINCT | |
712 phosphopeptide | |
713 , count(*) as i | |
714 FROM | |
715 uniprotkb_pep_ppep_view | |
716 GROUP BY | |
717 phosphopeptide | |
718 ORDER BY | |
719 phosphopeptide | |
720 "); | |
721 if (not $uniprotkb_pep_ppep_view_stmth->execute()) { | |
722 die "Error fetching peptide counts: $uniprotkb_pep_ppep_view_stmth->errstr\n"; | |
723 } | |
724 while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) { | |
725 $ppep_to_count_lut{$row[0]} = $row[1]; | |
726 #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n"; | |
727 } | |
728 | |
729 # accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id | |
730 # 0 1 2 3 4 5 6 7 8 9 | |
731 my $COL_ACCESSION = 0; | |
732 my $COL_PEPTIDE = 1; | |
733 my $COL_SEQUENCE = 2; | |
734 my $COL_DESCRIPTION = 3; | |
735 my $COL_PHOSPHOPEPTIDE = 4; | |
736 my $COL_LONG_DESCRIPTION = 5; | |
737 my $COL_POS_START = 6; | |
738 my $COL_POS_END = 7; | |
739 my $COL_SCRUBBED = 8; | |
740 my $COL_PPEP_ID = 9; | |
741 | |
742 my %ppep_to_row_lut; | |
743 print "start select all records without qualification " . format_localtime_iso8601() . "\n"; | |
744 $uniprotkb_pep_ppep_view_stmth = $dbh->prepare(" | |
745 SELECT DISTINCT | |
746 accession | |
747 , peptide | |
748 , sequence | |
749 , description | |
750 , phosphopeptide | |
751 , long_description | |
752 , pos_start | |
753 , pos_end | |
754 , scrubbed | |
755 , ppep_id | |
756 FROM | |
757 uniprotkb_pep_ppep_view | |
758 ORDER BY | |
759 phosphopeptide | |
760 "); | |
761 if (not $uniprotkb_pep_ppep_view_stmth->execute()) { | |
762 die "Error fetching all records without qualification: $uniprotkb_pep_ppep_view_stmth->errstr\n"; | |
763 } | |
764 my $current_ppep; | |
765 my $counter = 0; | |
766 my $former_ppep = ""; | |
767 @tmp_matches = (); | |
768 @tmp_accessions = (); | |
769 @tmp_names = (); | |
770 @tmp_sites = (); | |
771 while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) { | |
772 # Identify phosphopeptide for current row; | |
773 # it is an error for it to change when the counter is not zero. | |
774 $current_ppep = $row[$COL_PHOSPHOPEPTIDE]; | |
775 | |
776 # when counter is zero, prepare for a new phosphopeptide | |
777 if (not $current_ppep eq $former_ppep) { | |
778 die "counter is $counter instead of zero" if ($counter != 0); | |
779 $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID]; | |
780 print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose); | |
781 $counter = $ppep_to_count_lut{$current_ppep}; | |
782 @tmp_matches = (); | |
783 @tmp_accessions = (); | |
784 @tmp_names = (); | |
785 @tmp_sites = (); | |
786 } | |
787 | |
788 if ($USE_SEARCH_PPEP_PY) { | |
789 push(@tmp_matches, $row[ $COL_SEQUENCE ]); | |
790 push(@tmp_accessions, $row[ $COL_ACCESSION ]); | |
791 push(@tmp_names, $row[ $COL_LONG_DESCRIPTION ]); | |
792 push(@tmp_sites, $row[ $COL_POS_START ]); | |
793 } | |
794 | |
795 # Prepare counter and phosphopeptide tracker for next row | |
796 $former_ppep = $current_ppep; | |
797 $counter -= 1; | |
798 | |
799 # Set trackers for later use after last instance of current phosphopeptide | |
800 if ($counter == 0) { | |
801 if ($USE_SEARCH_PPEP_PY) { | |
802 $matched_sequences{$current_ppep} = [ @tmp_matches ]; | |
803 $accessions{ $current_ppep} = [ @tmp_accessions ]; | |
804 $names{ $current_ppep} = [ @tmp_names ]; | |
805 $sites{ $current_ppep} = [ @tmp_sites ]; | |
806 } | |
807 } | |
808 } | |
809 | |
810 | |
811 print "end select all records without qualification " . format_localtime_iso8601() . "\n"; | |
812 | |
813 for my $j (0 .. $#p_peptides) { | |
814 | |
815 #Find the matching protein sequence(s) for the peptide using SQLite | |
816 my ($site, $sequence); | |
817 my (@row, @rows); | |
818 my $match = 0; | |
819 my $p_peptide = $p_peptides[$j]; | |
820 @tmp_matches = (); | |
821 @tmp_accessions = (); | |
822 @tmp_names = (); | |
823 @tmp_sites = (); | |
824 | |
825 #Find the matching protein sequence(s) for the peptide using slow search | |
826 $site = -1; | |
827 unless ($USE_SEARCH_PPEP_PY) { | |
828 for my $k (0 .. $#sequences) { | |
829 $site = index($sequences[$k], $non_p_peptides[$j]); | |
830 if ($site != -1) { | |
831 push(@tmp_matches, $sequences[$k]); | |
832 push(@tmp_accessions, $accessions[$k]); | |
833 push(@tmp_names, $names[$k]); | |
834 push(@tmp_sites, $site); | |
835 } | |
836 # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n"; | |
837 $site = -1; $match++; | |
838 # print "tmp_accessions @tmp_accessions \n"; | |
839 } | |
840 if ($match == 0) { # Check to see if no match was found. Skip to next if no match found. | |
841 print "Warning: Failed match for $p_peptides[$j]\n"; | |
842 $matched_sequences{$p_peptides[$j]} = \@failed_match; | |
843 push(@failed_matches,$p_peptides[$j]); | |
844 next; | |
845 } else { | |
846 $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ]; | |
847 $accessions{$p_peptides[$j]} = [ @tmp_accessions ]; | |
848 $names{$p_peptides[$j]} = [ @tmp_names ]; | |
849 $sites{$p_peptides[$j]} = [ @tmp_sites ]; | |
850 } | |
851 } | |
852 | |
853 } # end for my $j (0 .. $#p_peptides) | |
854 | |
855 print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n"; | |
856 | |
857 print "--- Match the p_peptides to the \@sequences array:\n"; | |
858 | |
859 for my $peptide_to_match ( keys %matched_sequences ) { | |
860 if (grep($peptide_to_match, @failed_matches)) { | |
861 print "Failed to match peptide $peptide_to_match\n"; | |
862 } | |
863 next if (grep($peptide_to_match, @failed_matches)); | |
864 my @matches = @{$matched_sequences{$peptide_to_match}}; | |
865 @tmp_motifs_array = (); | |
866 for my $i (0 .. $#matches) { | |
867 | |
868 # Find the location of the phospo-site in the sequence(s) | |
869 $tmp_site = 0; my $offset = 0; | |
870 my $tmp_p_peptide = $peptide_to_match; | |
871 $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g; | |
872 | |
873 # Find all phosphorylated residues in the p_peptide | |
874 @p_sites = (); | |
875 while ($tmp_site != -1) { | |
876 $tmp_site = index($tmp_p_peptide, 'p', $offset); | |
877 if ($tmp_site != -1) {push (@p_sites, $tmp_site);} | |
878 $offset = $tmp_site + 1; | |
879 $tmp_p_peptide =~ s/p//; | |
880 } | |
881 @tmp_p_residues = (); | |
882 for my $l (0 .. $#p_sites) { | |
883 next if not defined $sites{$peptide_to_match}[$i]; | |
884 | |
885 push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]); | |
886 | |
887 # Match the sequences around the phospho residues to find the motifs | |
888 my ($desired_residues_L, $desired_residues_R); | |
889 if ($tmp_p_residues[0] - 10 < 0) { #check to see if there are fewer than 10 residues left of the first p-site | |
890 # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3 | |
891 $desired_residues_L = $tmp_p_residues[0]; | |
892 } | |
893 else { | |
894 $desired_residues_L = 10; | |
895 } | |
896 my $seq_length = length($matched_sequences{$peptide_to_match}[$i]); | |
897 if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) { #check to see if there are fewer than 10 residues right of the last p-site | |
898 $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1); | |
899 # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3 | |
900 # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n"; | |
901 } | |
902 else { | |
903 $desired_residues_R = 10; | |
904 } | |
905 | |
906 my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1; | |
907 my $arg2 = $tmp_p_residues[0] - $desired_residues_L; | |
908 my $arg1 = $matched_sequences{$peptide_to_match}[$i]; | |
909 | |
910 if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) { | |
911 $tmp_motif = substr($arg1, $arg2, $total_length); | |
912 | |
913 # Put the "p" back in front of the appropriate phospho-residue(s). | |
914 my (@tmp_residues, $tmp_position); | |
915 for my $m (0 .. $#p_sites) { | |
916 # print "Line 183: $p_sites[$m]\n"; | |
917 if ($m == 0) { | |
918 $tmp_position = $desired_residues_L; | |
919 } else { | |
920 $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0]; | |
921 } | |
922 if ($tmp_position < length($tmp_motif) + 1) { | |
923 push (@tmp_residues, substr($tmp_motif, $tmp_position, 1)); | |
924 if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");} | |
925 if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");} | |
926 if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");} | |
927 } | |
928 } | |
929 | |
930 $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g; | |
931 | |
932 # Comment out on 8.10.13 to remove the numbers from motifs | |
933 my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1; | |
934 my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1; | |
935 $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue; | |
936 push(@tmp_motifs_array, $tmp_motif); | |
937 $residues{$peptide_to_match}{$i} = [ @tmp_residues ]; | |
938 $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ]; | |
939 } | |
940 } | |
941 $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ]; | |
942 } # end for my $i (0 .. $#matches) ### this bracket could be in the wrong place | |
943 } | |
944 | |
945 print "... Finished match the p_peptides to the \@sequences array at " . format_localtime_iso8601() ."\n\n"; | |
946 | |
947 ############################################################################################################################### | |
948 # | |
949 # Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs | |
950 # | |
951 ############################################################################################################################### | |
952 | |
953 | |
954 print "--- Reading various site data:\n"; | |
955 | |
956 ############################################################################################################################### | |
957 # | |
958 # Read the NetworKIN_predictions file: | |
959 # 1) make a "kinases_observed" array | |
960 # 2) annotate the phospho-substrates with the appropriate kinase | |
961 # | |
962 ############################################################################################################################### | |
963 my $SITE_KINASE_SUBSTRATE = 1; | |
964 $site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN"; | |
965 | |
966 open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n"; | |
967 print "Reading the NetworKIN data: $networkin_in\n"; | |
968 while (<IN1>) { | |
969 chomp; | |
970 my (@x) = split(/\t/); | |
971 for my $i (0 .. $#x) { | |
972 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; | |
973 } | |
974 next if ($x[0] eq "#substrate"); | |
975 if (exists ($kinases -> {$x[2]})) { | |
976 #do nothing | |
977 } | |
978 else { | |
979 $kinases -> {$x[2]} = $x[2]; | |
980 push (@kinases_observed, $x[2]); | |
981 } | |
982 my $tmp = $x[10]."_".$x[2]; #eg, REEILsEMKKV_PKCalpha | |
983 if (exists($p_sequence_kinase -> {$tmp})) { | |
984 #do nothing | |
985 } | |
986 else { | |
987 $p_sequence_kinase -> {$tmp} = $tmp; | |
988 } | |
989 } | |
990 close IN1; | |
991 | |
992 ############################################################################################################################### | |
993 # | |
994 # Read the Kinase motifs file: | |
995 # 1) make a "motif_sequence" array | |
996 # | |
997 ############################################################################################################################### | |
998 | |
999 # file format (tab separated): | |
1000 # x[0] = quasi-primary key (character), e.g., '17' or '23a' | |
1001 # x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)' | |
1002 # x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)' | |
1003 # "counter" "pcre" "symbol" "description" "pubmed_id" "classification" "source" | |
1004 # "1" "R.R..(pS|pT)(F|L)" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8985174" "kinase substrate" "HPRD" | |
1005 # x[3] = old description, i.e., description in Amanchy (HPRD) and Phosida tables | |
1006 # x[4] = pubmed id | |
1007 # x[5] = classification | |
1008 # x[6] = source (Phosida or HPRD) | |
1009 my $SITE_HPRD = 2; | |
1010 $site_description{$SITE_HPRD} = "HPRD"; | |
1011 $site_id{$site_description{$SITE_HPRD}} = $SITE_HPRD; | |
1012 my $SITE_PHOSIDA = 4; | |
1013 $site_description{$SITE_PHOSIDA} = "Phosida"; | |
1014 $site_id{$site_description{$SITE_PHOSIDA}} = $SITE_PHOSIDA; | |
1015 | |
1016 open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n"; | |
1017 print "Reading the Motifs file: $motifs_in\n"; | |
1018 | |
1019 while (<IN2>) { | |
1020 chomp; | |
1021 my (@x) = split(/\t/); | |
1022 my $tmp_motif_description; | |
1023 if ($#x == 6) { # weirdly, a @list of length seven has $#list == 6 | |
1024 # remove double-quotes which are helpful or necessary for Excel | |
1025 $x[6] =~ s/\"//g; | |
1026 $tmp_motif_description = $x[6]; | |
1027 } else { | |
1028 $tmp_motif_description = "motif"; | |
1029 } | |
1030 for my $i (0 .. 2) { | |
1031 # remove any embedded CR or LF (none should exist) | |
1032 $x[$i] =~ s/\r//g; | |
1033 $x[$i] =~ s/\n//g; | |
1034 # remove double-quotes which are helpful or necessary for Excel | |
1035 $x[$i] =~ s/\"//g; | |
1036 } | |
1037 if (exists ($motif_type{$x[2]})) { | |
1038 #ACE-2022.06.20 $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2]; | |
1039 $motif_type{$x[2]} = $motif_type{$x[2]}."|".$x[2]; | |
1040 } else { | |
1041 $motif_type{$x[2]} = $x[2]; | |
1042 $motif_count{$x[1]} = 0; | |
1043 push (@motif_sequence, $x[1]); | |
1044 push (@motif_description, $tmp_motif_description); | |
1045 push (@motif_type_key_ary, $x[2]) | |
1046 } | |
1047 } | |
1048 close (IN2); | |
1049 | |
1050 | |
1051 ############################################################################################################################### | |
1052 # 6.28.2011 | |
1053 # Read PSP_Kinase_Substrate data: | |
1054 # 1) make a "kinases_PhosphoSite" array | |
1055 # 2) annotate the phospho-substrates with the appropriate kinase | |
1056 # | |
1057 # Columns: | |
1058 # (0) GENE | |
1059 # (1) KINASE | |
1060 # (2) KIN_ACC_ID | |
1061 # (3) KIN_ORGANISM | |
1062 # (4) SUBSTRATE | |
1063 # (5) SUB_GENE_ID | |
1064 # (6) SUB_ACC_ID | |
1065 # (7) SUB_GENE | |
1066 # (8) SUB_ORGANISM | |
1067 # (9) SUB_MOD_RSD | |
1068 # (10) SITE_GRP_ID | |
1069 # (11) SITE_+/-7_AA | |
1070 # (12) DOMAIN | |
1071 # (13) IN_VIVO_RXN | |
1072 # (14) IN_VITRO_RXN | |
1073 # (15) CST_CAT# | |
1074 ############################################################################################################################### | |
1075 | |
1076 my $SITE_PHOSPHOSITE = 3; | |
1077 $site_description{$SITE_PHOSPHOSITE} = "PhosphoSite"; | |
1078 | |
1079 | |
1080 $line = 0; | |
1081 | |
1082 open (IN3, "$PSP_Kinase_Substrate_in") or die "I couldn't find $PSP_Kinase_Substrate_in\n"; | |
1083 print "Reading the PhosphoSite Kinase-Substrate data: $PSP_Kinase_Substrate_in\n"; | |
1084 | |
1085 while (<IN3>) { | |
1086 chomp; | |
1087 my (@x) = split(/\t/); | |
1088 for my $i (0 .. $#x) { | |
1089 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; | |
1090 } | |
1091 if ($line != 0) { | |
1092 if (($species eq $x[3]) && ($species eq $x[8])) { | |
1093 if (exists ($kinases_PhosphoSite -> {$x[0]})) { | |
1094 #do nothing | |
1095 } | |
1096 else { | |
1097 $kinases_PhosphoSite -> {$x[0]} = $x[0]; | |
1098 push (@kinases_PhosphoSite, $x[0]); | |
1099 } | |
1100 my $offset = 0; | |
1101 # Replace the superfluous lower case s, t and y | |
1102 my @lowercase = ('s','t','y'); | |
1103 my @uppercase = ('S','T','Y'); | |
1104 for my $k (0 .. 2) { | |
1105 my $site = 0; | |
1106 while ($site != -1) { | |
1107 $site = index($x[11],$lowercase[$k], $offset); | |
1108 if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);} | |
1109 $offset = $site + 1; | |
1110 } | |
1111 } | |
1112 my $tmp = $x[11]."_".$x[0]; #eg, RTPGRPLsSYGMDSR_PAK2 | |
1113 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { | |
1114 #do nothing | |
1115 } | |
1116 else { | |
1117 $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp; | |
1118 } | |
1119 } | |
1120 else { | |
1121 # do nothing | |
1122 #print "PSP_kinase_substrate line rejected because KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]': $line\n"; | |
1123 } | |
1124 } | |
1125 $line++; | |
1126 } | |
1127 close IN3; | |
1128 | |
1129 | |
1130 ############################################################################################################################### | |
1131 # Read PhosphoSite regulatory site data: | |
1132 # 1) make a "regulatory_sites_PhosphoSite" hash | |
1133 # | |
1134 # Columns: | |
1135 # (0) GENE | |
1136 # (2) PROT_TYPE | |
1137 # (3) ACC_ID | |
1138 # (4) GENE_ID | |
1139 # (5) HU_CHR_LOC | |
1140 # (6) ORGANISM --> %organism | |
1141 # (7) MOD_RSD | |
1142 # (8) SITE_GRP_ID | |
1143 # (9) SITE_+/-7_AA --> %regulatory_sites_PhosphoSite_hash | |
1144 # (10) DOMAIN --> %domain | |
1145 # (11) ON_FUNCTION --> %ON_FUNCTION | |
1146 # (12) ON_PROCESS --> %ON_PROCESS | |
1147 # (13) ON_PROT_INTERACT --> %ON_PROT_INTERACT | |
1148 # (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT | |
1149 # (15) PMIDs | |
1150 # (16) LT_LIT | |
1151 # (17) MS_LIT | |
1152 # (18) MS_CST | |
1153 # (19) NOTES --> %notes | |
1154 ############################################################################################################################### | |
1155 | |
1156 | |
1157 $dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef); | |
1158 my $auto_commit = $dbh->{AutoCommit}; | |
1159 $dbh->{AutoCommit} = 0; | |
1160 print "DB connection $dbh is to $db_out, opened for modification\n"; | |
1161 | |
1162 # add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite | |
1163 $stmth = $dbh->prepare(" | |
1164 CREATE TABLE IF NOT EXISTS PSP_Regulatory_site ( | |
1165 SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, | |
1166 DOMAIN TEXT, | |
1167 ON_FUNCTION TEXT, | |
1168 ON_PROCESS TEXT, | |
1169 ON_PROT_INTERACT TEXT, | |
1170 ON_OTHER_INTERACT TEXT, | |
1171 NOTES TEXT, | |
1172 ORGANISM TEXT, | |
1173 PROTEIN TEXT | |
1174 ) | |
1175 "); | |
1176 $stmth->execute(); | |
1177 | |
1178 # add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite | |
1179 $stmth = $dbh->prepare(" | |
1180 CREATE TABLE IF NOT EXISTS ppep_regsite_LUT | |
1181 ( ppep_id INTEGER REFERENCES ppep(id) | |
1182 , site_plusminus_7AA TEXT REFERENCES PSP_Regulatory_site(site_plusminus_7AA) | |
1183 , PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE | |
1184 ); | |
1185 "); | |
1186 $stmth->execute(); | |
1187 | |
1188 # $stmth = $dbh->prepare(" | |
1189 # CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0 | |
1190 # ON PSP_Regulatory_site(site_plusminus_7AA); | |
1191 # "); | |
1192 # $stmth->execute(); | |
1193 | |
1194 | |
1195 # add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite | |
1196 my $citation_sql; | |
1197 $citation_sql = " | |
1198 CREATE TABLE IF NOT EXISTS Citation ( | |
1199 ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE, | |
1200 CitationData TEXT, | |
1201 PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE | |
1202 ) | |
1203 "; | |
1204 $stmth = $dbh->prepare($citation_sql); | |
1205 $stmth->execute(); | |
1206 | |
1207 | |
1208 open (IN4, "$PSP_Regulatory_Sites_in") or die "I couldn't find $PSP_Regulatory_Sites_in\n"; | |
1209 print "Reading the PhosphoSite regulatory site data: $PSP_Regulatory_Sites_in\n"; | |
1210 | |
1211 | |
1212 $line = -1; | |
1213 while (<IN4>) { | |
1214 $line++; | |
1215 chomp; | |
1216 if ($_ =~ m/PhosphoSitePlus/) { | |
1217 #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g); | |
1218 $PhosphoSitePlusCitation = $_; | |
1219 $PhosphoSitePlusCitation =~ s/\t//g; | |
1220 $PhosphoSitePlusCitation =~ s/\r//g; | |
1221 $PhosphoSitePlusCitation =~ s/\n//g; | |
1222 $PhosphoSitePlusCitation =~ s/""/"/g; | |
1223 $PhosphoSitePlusCitation =~ s/^"//g; | |
1224 $PhosphoSitePlusCitation =~ s/"$//g; | |
1225 print "$PhosphoSitePlusCitation\n"; | |
1226 next; | |
1227 } | |
1228 my (@x) = split(/\t/); | |
1229 for my $i (0 .. $#x) { | |
1230 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; | |
1231 } | |
1232 my $found_GENE=0; | |
1233 if ( (not exists($x[0])) ) { | |
1234 next; | |
1235 } | |
1236 elsif ( ($x[0] eq "GENE") ) { | |
1237 $found_GENE=1; | |
1238 next; | |
1239 } | |
1240 if ( (not exists($x[9])) || ($x[9] eq "") ) { | |
1241 if (exists($x[8]) && (not $x[8] eq "")) { | |
1242 die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n"; | |
1243 } else { | |
1244 if ( (not exists($x[1])) || (not $x[1] eq "") ) { | |
1245 print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n" | |
1246 if $found_GENE==1; | |
1247 } | |
1248 next; | |
1249 } | |
1250 } | |
1251 elsif ($line != 0) { | |
1252 if ($species ne $x[6]) { | |
1253 # Do nothing - this record was filtered out by the species filter | |
1254 } | |
1255 elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) { | |
1256 if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") { | |
1257 $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9]; | |
1258 $domain{$x[9]} = $x[10]; | |
1259 $ON_FUNCTION{$x[9]} = $x[11]; | |
1260 $ON_PROCESS{$x[9]} = $x[12]; | |
1261 $ON_PROT_INTERACT{$x[9]} = $x[13]; | |
1262 $ON_OTHER_INTERACT{$x[9]} = $x[14]; | |
1263 $notes{$x[9]} = $x[19]; | |
1264 $organism{$x[9]} = $x[6]; | |
1265 } | |
1266 } | |
1267 else { | |
1268 # $domain | |
1269 if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") { | |
1270 if ($x[10] ne "") { | |
1271 $domain{$x[9]} = $domain{$x[10]}; | |
1272 } | |
1273 else { | |
1274 # do nothing | |
1275 } | |
1276 } | |
1277 else { | |
1278 if ($domain{$x[9]} =~ /$x[10]/) { | |
1279 # do nothing | |
1280 } | |
1281 else { | |
1282 $domain{$x[9]} = $domain{$x[9]}." / ".$x[10]; | |
1283 #print "INFO line $line - compound domain for 7aa: GENE $x[0] PROTEIN $x[1] PROT_TYPE $x[2] ACC_ID $x[3] GENE_ID $x[4] HU_CHR_LOC $x[5] ORGANISM $x[6] MOD_RSD $x[7] SITE_GRP_ID $x[8] SITE_+/-7_AA $x[9] DOMAIN $domain{$x[9]}\n"; | |
1284 } | |
1285 } | |
1286 | |
1287 # $ON_FUNCTION | |
1288 if (!defined $ON_FUNCTION{$x[9]} || $ON_FUNCTION{$x[9]} eq "") { | |
1289 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]}; | |
1290 } elsif ($x[10] eq "") { | |
1291 # do nothing | |
1292 } | |
1293 else { | |
1294 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10]; | |
1295 } | |
1296 | |
1297 # $ON_PROCESS | |
1298 if (!defined $ON_PROCESS{$x[9]} || $ON_PROCESS{$x[9]} eq "") { | |
1299 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]}; | |
1300 } elsif ($x[10] eq "") { | |
1301 # do nothing | |
1302 } | |
1303 else { | |
1304 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10]; | |
1305 } | |
1306 | |
1307 # $ON_PROT_INTERACT | |
1308 if (!defined $ON_PROT_INTERACT{$x[9]} || $ON_PROT_INTERACT{$x[9]} eq "") { | |
1309 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]}; | |
1310 } elsif ($x[10] eq "") { | |
1311 # do nothing | |
1312 } | |
1313 else { | |
1314 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10]; | |
1315 } | |
1316 | |
1317 # $ON_OTHER_INTERACT | |
1318 if (!defined $ON_OTHER_INTERACT{$x[9]} || $ON_OTHER_INTERACT{$x[9]} eq "") { | |
1319 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]}; | |
1320 } elsif ($x[10] eq "") { | |
1321 # do nothing | |
1322 } | |
1323 else { | |
1324 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10]; | |
1325 } | |
1326 | |
1327 # $notes | |
1328 if (!defined $notes{$x[9]} || $notes{$x[9]} eq "") { | |
1329 $notes{$x[9]} = $notes{$x[10]}; | |
1330 } elsif ($x[10] eq "") { | |
1331 # do nothing | |
1332 } | |
1333 else { | |
1334 $notes{$x[9]} = $notes{$x[9]}." / ".$x[10]; | |
1335 } | |
1336 | |
1337 # $organism | |
1338 if (!defined $organism{$x[9]} || $organism{$x[9]} eq "") { | |
1339 $organism{$x[9]} = $organism{$x[10]}; | |
1340 } elsif ($x[10] eq "") { | |
1341 # do nothing | |
1342 } | |
1343 else { | |
1344 $organism{$x[9]} = $organism{$x[9]}." / ".$x[10]; | |
1345 } | |
1346 } | |
1347 } | |
1348 } | |
1349 close IN4; | |
1350 | |
1351 print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n"; | |
1352 | |
1353 $stmth = $dbh->prepare(" | |
1354 INSERT INTO Citation ( | |
1355 ObjectName, | |
1356 CitationData | |
1357 ) VALUES (?,?) | |
1358 "); | |
1359 | |
1360 sub add_citation { | |
1361 my ($cit_table, $cit_text, $cit_label) = @_; | |
1362 $stmth->bind_param(1, $cit_table); | |
1363 $stmth->bind_param(2, $cit_text); | |
1364 if (not $stmth->execute()) { | |
1365 print "Error writing $cit_label cit for table $cit_table: $stmth->errstr\n"; | |
1366 } | |
1367 } | |
1368 my ($citation_text, $citation_table); | |
1369 | |
1370 # PSP regulatory or kinase/substrate site | |
1371 $citation_text = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'; | |
1372 $citation_table = "PSP_Regulatory_site"; | |
1373 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); | |
1374 $citation_table = "psp_gene_site"; | |
1375 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); | |
1376 $citation_table = "psp_gene_site_view"; | |
1377 add_citation($citation_table, $citation_text, "PSP_Regulatory_site"); | |
1378 $citation_text = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'; | |
1379 $citation_table = "PSP_Regulatory_site"; | |
1380 add_citation($citation_table, $citation_text, "PSP_Regulatory_site"); | |
1381 $citation_table = "psp_gene_site"; | |
1382 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); | |
1383 $citation_table = "psp_gene_site_view"; | |
1384 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); | |
1385 | |
1386 # NetworKIN site | |
1387 $citation_text = 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052'; | |
1388 $citation_table = "psp_gene_site"; | |
1389 add_citation($citation_table, $citation_text, "NetworkKIN"); | |
1390 $citation_table = "psp_gene_site_view"; | |
1391 add_citation($citation_table, $citation_text, "NetworkKIN"); | |
1392 $citation_text = 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.296'; | |
1393 $citation_table = "psp_gene_site"; | |
1394 add_citation($citation_table, $citation_text, "NetworkKIN"); | |
1395 $citation_table = "psp_gene_site_view"; | |
1396 add_citation($citation_table, $citation_text, "NetworkKIN"); | |
1397 $citation_text = 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093'; | |
1398 $citation_table = "psp_gene_site"; | |
1399 add_citation($citation_table, $citation_text, "NetworkKIN"); | |
1400 $citation_table = "psp_gene_site_view"; | |
1401 add_citation($citation_table, $citation_text, "NetworkKIN"); | |
1402 | |
1403 # pSTY motifs | |
1404 $citation_text = 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285'; | |
1405 $citation_table = "psp_gene_site"; | |
1406 add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs"); | |
1407 $citation_table = "psp_gene_site_view"; | |
1408 add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs"); | |
1409 $citation_text = 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159'; | |
1410 $citation_table = "psp_gene_site"; | |
1411 add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs"); | |
1412 $citation_table = "psp_gene_site_view"; | |
1413 add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs"); | |
1414 | |
1415 | |
1416 ############################################################################################################################### | |
1417 # | |
1418 # Read the data file: | |
1419 # 1) find sequences that match the NetworKIN predictions | |
1420 # 2) find motifs that match the observed sequences | |
1421 # | |
1422 ############################################################################################################################### | |
1423 | |
1424 print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n"; | |
1425 | |
1426 my $ppep_regsite_LUT_stmth; | |
1427 $ppep_regsite_LUT_stmth = $dbh->prepare(" | |
1428 INSERT INTO ppep_regsite_LUT ( | |
1429 ppep_id, | |
1430 site_plusminus_7AA | |
1431 ) VALUES (?,?) | |
1432 "); | |
1433 | |
1434 my ($start_seconds, $start_microseconds) = gettimeofday; | |
1435 | |
1436 foreach my $peptide (keys %data) { | |
1437 # find the unique phospho-motifs for this $peptide | |
1438 my @all_motifs = (); | |
1439 my $have_all_motifs = 0; | |
1440 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { | |
1441 my $tmp_motif = $p_motifs{$peptide}[$i]; | |
1442 push(@all_motifs, $tmp_motif); | |
1443 $have_all_motifs = 1; | |
1444 } | |
1445 if ($have_all_motifs == 1) { | |
1446 for my $j (0 .. $#all_motifs) { | |
1447 if (defined $all_motifs[$j]) { | |
1448 $all_motifs[$j] =~ s/\d+-\[\s//; | |
1449 $all_motifs[$j] =~ s/\s\]\-\d+//; | |
1450 } | |
1451 } | |
1452 } | |
1453 my %seen = (); | |
1454 if ($have_all_motifs == 1) { | |
1455 foreach my $a (@all_motifs) { | |
1456 if (defined $a) { | |
1457 if (exists($seen{$a})) { | |
1458 next; | |
1459 } else { | |
1460 push(@{$unique_motifs{$peptide}}, $a); | |
1461 $seen{$a} = 1; | |
1462 } | |
1463 } | |
1464 print "push(\@{\$unique_motifs{$peptide}}, $a);\n" if ($verbose); | |
1465 } | |
1466 } | |
1467 | |
1468 # count the number of phospo-sites in the motif | |
1469 my $number_pY = 0; | |
1470 my $number_pSTY = 0; | |
1471 if ($phospho_type eq 'y') { | |
1472 if (defined(${$unique_motifs{$peptide}}[0])) { | |
1473 while (${$unique_motifs{$peptide}}[0] =~ /pY/g) { | |
1474 $number_pY++; | |
1475 } | |
1476 } | |
1477 } | |
1478 if ($phospho_type eq 'sty') { | |
1479 print "looking for unique_motifs for $peptide\n" if ($verbose); | |
1480 if (defined(${$unique_motifs{$peptide}}[0])) { | |
1481 while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) { | |
1482 $number_pSTY++; | |
1483 print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose); | |
1484 } | |
1485 } | |
1486 } | |
1487 | |
1488 | |
1489 # search each of the unique motifs for matches | |
1490 print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose); | |
1491 for my $i (0 .. $#{$unique_motifs{$peptide}}) { | |
1492 print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose); | |
1493 my $tmp_motif = ${$unique_motifs{$peptide}}[$i]; | |
1494 print " --- matching unique motif $tmp_motif for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); | |
1495 my $formatted_sequence; | |
1496 if (($number_pY == 1) || ($number_pSTY == 1)) { | |
1497 my $seq_plus5aa = ""; | |
1498 my $seq_plus7aa = ""; | |
1499 $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type); | |
1500 print " a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); | |
1501 if ($phospho_type eq 'y') { | |
1502 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1]; | |
1503 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1]; | |
1504 } | |
1505 elsif ($phospho_type eq "sty") { | |
1506 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1]; | |
1507 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1]; | |
1508 } | |
1509 | |
1510 if (defined $seq_plus7aa) { | |
1511 # commit the 7aa LUT records | |
1512 $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} ); | |
1513 $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa ); | |
1514 if (not $ppep_regsite_LUT_stmth->execute()) { | |
1515 print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: $ppep_regsite_LUT_stmth->errstr\n"; | |
1516 } | |
1517 } | |
1518 for my $i (0 .. $#kinases_observed) { | |
1519 if (defined $seq_plus5aa) { | |
1520 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha | |
1521 if (exists($p_sequence_kinase -> {$tmp})) { | |
1522 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE | |
1523 } | |
1524 } | |
1525 } | |
1526 for my $i (0 .. $#motif_sequence) { | |
1527 print "matching $motif_sequence[$i]" if ($verbose); | |
1528 if ($peptide =~ /$motif_sequence[$i]/) { | |
1529 $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X"; | |
1530 } | |
1531 } | |
1532 for my $i (0 .. $#kinases_PhosphoSite) { | |
1533 if (defined $seq_plus7aa) { | |
1534 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 | |
1535 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { | |
1536 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; | |
1537 } | |
1538 } | |
1539 } | |
1540 if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) { | |
1541 $seq_plus7aa_2{$peptide} = $seq_plus7aa; | |
1542 $domain_2{$peptide} = $domain{$seq_plus7aa}; | |
1543 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; | |
1544 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; | |
1545 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; | |
1546 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; | |
1547 $notes_2{$peptide} = $notes{$seq_plus7aa}; | |
1548 $organism_2{$peptide} = $organism{$seq_plus7aa}; | |
1549 } else { | |
1550 } | |
1551 } | |
1552 elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2 | |
1553 $formatted_sequence = $tmp_motif; | |
1554 $seq_plus5aa = ""; | |
1555 $seq_plus7aa = ""; | |
1556 #Create the sequences with only one phosphorylation site | |
1557 #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329 | |
1558 | |
1559 my (@sites, $offset, $next_p_site); | |
1560 $sites[0] = index($tmp_motif, "p"); | |
1561 $offset = $sites[0] + 1; | |
1562 $next_p_site = 0; | |
1563 while ($next_p_site != -1) { | |
1564 $next_p_site = index($tmp_motif, "p", $offset); | |
1565 if ($next_p_site != -1) { | |
1566 push (@sites, $next_p_site); | |
1567 } | |
1568 $offset = $next_p_site+1; | |
1569 } | |
1570 | |
1571 my @pSTY_sequences; | |
1572 for my $n (0 .. $#sites) { | |
1573 $pSTY_sequences[$n] = $tmp_motif; | |
1574 for (my $m = $#sites; $m >= 0; $m--) { | |
1575 if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";} | |
1576 } | |
1577 } | |
1578 | |
1579 my @formatted_sequences; | |
1580 for my $k (0 .. $#sites) { | |
1581 $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type); | |
1582 } | |
1583 | |
1584 for my $k (0 .. $#formatted_sequences) { | |
1585 print " b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); | |
1586 if ($phospho_type eq 'y') { | |
1587 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1]; | |
1588 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1]; | |
1589 } | |
1590 elsif ($phospho_type eq "sty") { | |
1591 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1]; | |
1592 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1]; | |
1593 } | |
1594 for my $i (0 .. $#kinases_observed) { | |
1595 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha | |
1596 if (exists($p_sequence_kinase -> {$tmp})) { | |
1597 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; | |
1598 } | |
1599 } | |
1600 $pSTY_sequence = $formatted_sequences[$k]; | |
1601 for my $i (0 .. $#motif_sequence) { | |
1602 if ($pSTY_sequence =~ /$motif_sequence[$i]/) { | |
1603 $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X"; | |
1604 } | |
1605 } | |
1606 for my $i (0 .. $#kinases_PhosphoSite) { | |
1607 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 | |
1608 #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp"; | |
1609 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { | |
1610 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; | |
1611 } | |
1612 } | |
1613 if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) { | |
1614 $seq_plus7aa_2{$peptide} = $seq_plus7aa; | |
1615 | |
1616 # $domain | |
1617 if ($domain_2{$peptide} eq "") { | |
1618 $domain_2{$peptide} = $domain{$seq_plus7aa}; | |
1619 } | |
1620 elsif ($domain{$seq_plus7aa} eq "") { | |
1621 # do nothing | |
1622 } | |
1623 else { | |
1624 $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa}; | |
1625 } | |
1626 | |
1627 | |
1628 # $ON_FUNCTION_2 | |
1629 if ($ON_FUNCTION_2{$peptide} eq "") { | |
1630 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; | |
1631 } | |
1632 elsif ($ON_FUNCTION{$seq_plus7aa} eq "") { | |
1633 # do nothing | |
1634 } | |
1635 else { | |
1636 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa}; | |
1637 } | |
1638 | |
1639 # $ON_PROCESS_2 | |
1640 if ($ON_PROCESS_2{$peptide} eq "") { | |
1641 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; | |
1642 } | |
1643 elsif ($ON_PROCESS{$seq_plus7aa} eq "") { | |
1644 # do nothing | |
1645 } | |
1646 else { | |
1647 $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa}; | |
1648 } | |
1649 | |
1650 # $ON_PROT_INTERACT_2 | |
1651 if ($ON_PROT_INTERACT_2{$peptide} eq "") { | |
1652 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; | |
1653 } | |
1654 elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") { | |
1655 # do nothing | |
1656 } | |
1657 else { | |
1658 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa}; | |
1659 } | |
1660 | |
1661 # $ON_OTHER_INTERACT_2 | |
1662 if ($ON_OTHER_INTERACT_2{$peptide} eq "") { | |
1663 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; | |
1664 } | |
1665 elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") { | |
1666 # do nothing | |
1667 } | |
1668 else { | |
1669 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa}; | |
1670 } | |
1671 | |
1672 # $notes_2 | |
1673 if ($notes_2{$peptide} eq "") { | |
1674 $notes_2{$peptide} = $notes{$seq_plus7aa}; | |
1675 } | |
1676 elsif ($notes{$seq_plus7aa} eq "") { | |
1677 # do nothing | |
1678 } | |
1679 else { | |
1680 $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa}; | |
1681 } | |
1682 $notes_2{$peptide} = $notes{$seq_plus7aa}; | |
1683 | |
1684 # $organism_2 | |
1685 if ($organism_2{$peptide} eq "") { | |
1686 $organism_2{$peptide} = $organism{$seq_plus7aa}; | |
1687 } | |
1688 elsif ($organism{$seq_plus7aa} eq "") { | |
1689 # do nothing | |
1690 } | |
1691 else { | |
1692 $organism_2{$peptide} = $organism_2{$peptide}." / ".$organism{$seq_plus7aa}; | |
1693 } | |
1694 $organism_2{$peptide} = $organism{$seq_plus7aa}; | |
1695 } else { | |
1696 } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) | |
1697 } # for my $k (0 .. $#formatted_sequences) | |
1698 } # if/else number of phosphosites | |
1699 } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}}) | |
1700 } # for each $peptide | |
1701 | |
1702 my ($end_seconds, $end_microseconds) = gettimeofday; | |
1703 | |
1704 my $delta_seconds = $end_seconds - $start_seconds; | |
1705 my $delta_microseconds = $end_microseconds - $start_microseconds; | |
1706 $delta_microseconds += 1000000 * $delta_seconds; | |
1707 my $key_count = keys(%data); | |
1708 print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($delta_microseconds / $key_count)); | |
1709 | |
1710 ($start_seconds, $start_microseconds) = gettimeofday; | |
1711 | |
1712 print "Writing PSP_Regulatory_site records\n"; | |
1713 | |
1714 my $psp_regulatory_site_stmth = $dbh->prepare(" | |
1715 INSERT INTO PSP_Regulatory_site ( | |
1716 DOMAIN, | |
1717 ON_FUNCTION, | |
1718 ON_PROCESS, | |
1719 ON_PROT_INTERACT, | |
1720 ON_OTHER_INTERACT, | |
1721 NOTES, | |
1722 SITE_PLUSMINUS_7AA, | |
1723 ORGANISM | |
1724 ) VALUES (?,?,?,?,?,?,?,?) | |
1725 "); | |
1726 | |
1727 foreach my $peptide (keys %data) { | |
1728 if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) { | |
1729 $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide}); | |
1730 $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide}); | |
1731 $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide}); | |
1732 $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide}); | |
1733 $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide}); | |
1734 $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide}); | |
1735 $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide}); | |
1736 $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide}); | |
1737 if (not $psp_regulatory_site_stmth->execute()) { | |
1738 print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n"; | |
1739 } else { | |
1740 } | |
1741 } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) { | |
1742 print "\$domain_2{$peptide} is undefined\n"; #ACE | |
1743 } | |
1744 } | |
1745 | |
1746 $dbh->{AutoCommit} = $auto_commit; | |
1747 # auto_commit implicitly finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish; | |
1748 $dbh->disconnect if ( defined $dbh ); | |
1749 | |
1750 | |
1751 ($end_seconds, $end_microseconds) = gettimeofday; | |
1752 | |
1753 $delta_seconds = $end_seconds - $start_seconds; | |
1754 $delta_microseconds = $end_microseconds - $start_microseconds; | |
1755 $delta_microseconds += 1000000 * $delta_seconds; | |
1756 $key_count = keys(%data); | |
1757 print sprintf("Write time is %d microseconds\n", ($delta_microseconds)); | |
1758 | |
1759 print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n"; | |
1760 | |
1761 ############################################################################################################################### | |
1762 # | |
1763 # Print to the output file | |
1764 # | |
1765 ############################################################################################################################### | |
1766 | |
1767 | |
1768 open (OUT, ">$file_out") || die "could not open the fileout: $file_out"; | |
1769 open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt"; | |
1770 | |
1771 # print the header info | |
1772 print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n"; | |
1773 print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t"; | |
1774 | |
1775 # print the PhosphoSite regulatory data | |
1776 print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t"; | |
1777 | |
1778 # print the sample names | |
1779 for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; } | |
1780 | |
1781 # print the kinases and groups | |
1782 for my $i (0 .. $#kinases_observed) { | |
1783 my $temp = $kinases_observed[$i]."_NetworKIN"; | |
1784 print OUT "$temp\t"; | |
1785 push(@kinases_observed_lbl, $temp); | |
1786 } | |
1787 my @motif_type_keys = keys %motif_type; | |
1788 for my $i (1 .. $#motif_type_keys) { | |
1789 print OUT "$motif_type{$motif_type_keys[$i]}\t"; | |
1790 } | |
1791 for my $i (0 .. $#kinases_PhosphoSite) { | |
1792 my $temp = $kinases_PhosphoSite[$i]; # ."_PhosphoSite"; | |
1793 if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; } | |
1794 if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; } | |
1795 push(@phosphosites_observed_lbl, $temp); | |
1796 } | |
1797 | |
1798 # begin DDL-to-SQLite | |
1799 # --- | |
1800 $dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef); | |
1801 $auto_commit = $dbh->{AutoCommit}; | |
1802 $dbh->{AutoCommit} = 0; | |
1803 print "DB connection $dbh is to $db_out, opened for modification\n"; | |
1804 | |
1805 my $sample_stmth; | |
1806 $sample_stmth = $dbh->prepare(" | |
1807 INSERT INTO sample ( | |
1808 id, | |
1809 name | |
1810 ) VALUES (?,?) | |
1811 "); | |
1812 | |
1813 my $ppep_intensity_stmth; | |
1814 $ppep_intensity_stmth = $dbh->prepare(" | |
1815 INSERT INTO ppep_intensity ( | |
1816 ppep_id, | |
1817 sample_id, | |
1818 intensity | |
1819 ) VALUES (?,?,?) | |
1820 "); | |
1821 | |
1822 my $site_type_stmth; | |
1823 $site_type_stmth = $dbh->prepare(" | |
1824 insert into site_type ( | |
1825 id, | |
1826 type_name | |
1827 ) values (?,?) | |
1828 "); | |
1829 | |
1830 my $ppep_gene_site_stmth; | |
1831 $ppep_gene_site_stmth = $dbh->prepare(" | |
1832 insert into ppep_gene_site ( | |
1833 ppep_id, | |
1834 gene_names, | |
1835 kinase_map, | |
1836 site_type_id | |
1837 ) values (?,?,?,?) | |
1838 "); | |
1839 | |
1840 my $ppep_metadata_stmth; | |
1841 $ppep_metadata_stmth = $dbh->prepare(" | |
1842 INSERT INTO ppep_metadata | |
1843 ( ppep_id | |
1844 , protein_description | |
1845 , gene_name | |
1846 , FASTA_name | |
1847 , phospho_sites | |
1848 , motifs_unique | |
1849 , accessions | |
1850 , motifs_all_members | |
1851 , domain | |
1852 , ON_FUNCTION | |
1853 , ON_PROCESS | |
1854 , ON_PROT_INTERACT | |
1855 , ON_OTHER_INTERACT | |
1856 , notes | |
1857 ) VALUES ( | |
1858 ?,?,?,?,?,?,? | |
1859 , ?,?,?,?,?,?,? | |
1860 ) | |
1861 "); | |
1862 # end DDL-to-SQLite | |
1863 # ... | |
1864 | |
1865 # begin store-to-SQLite "sample" table | |
1866 # --- | |
1867 # %sample_id_lut maps name -> ID | |
1868 for my $sample_name (keys %sample_id_lut) { | |
1869 $sample_stmth->bind_param( 2, $sample_name ); | |
1870 $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} ); | |
1871 if (not $sample_stmth->execute()) { | |
1872 print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): $sample_stmth->errstr\n"; | |
1873 } | |
1874 } | |
1875 # end store-to-SQLite "sample" table | |
1876 # ... | |
1877 | |
1878 # begin store-to-SQLite "site_type" table | |
1879 # --- | |
1880 sub add_site_type { | |
1881 my ($site_type_id, $site_type_type_name) = @_; | |
1882 $site_type_stmth->bind_param( 2, $site_type_type_name ); | |
1883 $site_type_stmth->bind_param( 1, $site_type_id ); | |
1884 if (not $site_type_stmth->execute()) { | |
1885 die "Error writing tuple ($site_type_id,$site_type_type_name): $site_type_stmth->errstr\n"; | |
1886 } | |
1887 } | |
1888 add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE}); | |
1889 add_site_type($SITE_HPRD , $site_description{$SITE_HPRD }); | |
1890 add_site_type($SITE_PHOSIDA , $site_description{$SITE_PHOSIDA }); | |
1891 add_site_type($SITE_PHOSPHOSITE , $site_description{$SITE_PHOSPHOSITE }); | |
1892 # end store-to-SQLite "site_type" table | |
1893 # ... | |
1894 | |
1895 foreach my $peptide (sort(keys %data)) { | |
1896 next if (grep($peptide, @failed_matches)); | |
1897 my $ppep_id = $ppep_id_lut{$peptide}; | |
1898 my @ppep_metadata = (); | |
1899 my @ppep_intensity = (); | |
1900 my @gene = (); | |
1901 my $gene_names; | |
1902 my $j; | |
1903 # Print the peptide itself | |
1904 # column 1: p-peptide | |
1905 print OUT "$peptide\t"; | |
1906 push (@ppep_metadata, $ppep_id); | |
1907 push (@ppep_intensity, $peptide); | |
1908 | |
1909 my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR'; | |
1910 # skip over failed matches | |
1911 print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond; | |
1912 if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) { | |
1913 # column 2: Protein description | |
1914 # column 3: Gene name(s) | |
1915 # column 4: FASTA name | |
1916 # column 5: phospho-residues | |
1917 # Column 6: UNIQUE phospho-motifs | |
1918 # Column 7: accessions | |
1919 # Column 8: ALL motifs with residue numbers | |
1920 # 2 3 4 5 6 7 8 | |
1921 print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t"; | |
1922 print "No match found for '$peptide' in sequence database\n"; | |
1923 $gene_names = '$FAILED_MATCH_GENE_NAME'; | |
1924 } else { | |
1925 my @description = (); | |
1926 my %seen = (); | |
1927 # Print just the protein description | |
1928 for $i (0 .. $#{$names{$peptide}}) { | |
1929 my $long_name = $names{$peptide}[$i]; | |
1930 my @naming_parts = split(/\sOS/, $long_name); | |
1931 my @front_half = split(/\s/, $naming_parts[0]); | |
1932 push(@description, join(" ", @front_half[1..($#front_half)])); | |
1933 } | |
1934 # column 2: Protein description | |
1935 print OUT join(" /// ", @description), "\t"; | |
1936 push (@ppep_metadata, join(" /// ", @description)); | |
1937 | |
1938 # Print just the gene name | |
1939 for $i (0 .. $#{$names{$peptide}}) { | |
1940 my $tmp_gene = $names{$peptide}[$i]; | |
1941 $tmp_gene =~ s/^.*GN=//; | |
1942 $tmp_gene =~ s/\s.*//; | |
1943 if (!exists($seen{$tmp_gene})) { | |
1944 push(@gene, $tmp_gene); | |
1945 $seen{$tmp_gene} = $tmp_gene; | |
1946 } | |
1947 } | |
1948 # column 3: Gene name(s) | |
1949 $gene_names = join(" /// ", @gene); | |
1950 print OUT $gene_names, "\t"; | |
1951 push (@ppep_metadata, join(" /// ", @gene)); | |
1952 | |
1953 # column 4: FASTA name | |
1954 print OUT join(" /// ", @{$names{$peptide}}), "\t"; | |
1955 push (@ppep_metadata, join(" /// ", @{$names{$peptide}})); | |
1956 | |
1957 # column 5: phospho-residues | |
1958 my $tmp_for_insert = ""; | |
1959 my $foobar; | |
1960 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { | |
1961 print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond; | |
1962 if ($i < $#{ $matched_sequences{$peptide} }) { | |
1963 if (defined $p_residues{$peptide}{$i}) { | |
1964 @tmp_p_residues = @{$p_residues{$peptide}{$i}}; | |
1965 for $j (0 .. $#tmp_p_residues) { | |
1966 if ($j < $#tmp_p_residues) { | |
1967 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
1968 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; | |
1969 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; | |
1970 } | |
1971 elsif ($j == $#tmp_p_residues) { | |
1972 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
1973 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// "; | |
1974 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// "; | |
1975 } | |
1976 } | |
1977 } | |
1978 } | |
1979 elsif ($i == $#{ $matched_sequences{$peptide} }) { | |
1980 my $there_were_sites = 0; | |
1981 if (defined $p_residues{$peptide}{$i}) { | |
1982 @tmp_p_residues = @{$p_residues{$peptide}{$i}}; | |
1983 if ($#tmp_p_residues > 0) { | |
1984 for my $j (0 .. $#tmp_p_residues) { | |
1985 if ($j < $#tmp_p_residues) { | |
1986 if (defined $p_residues{$peptide}{$i}[$j]) { | |
1987 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
1988 $foobar = $residues{$peptide}{$i}[$j]; | |
1989 if (defined $foobar) { | |
1990 print OUT "$foobar"; | |
1991 print OUT "$tmp_site_for_printing, "; | |
1992 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; | |
1993 $there_were_sites = 1; | |
1994 } | |
1995 } | |
1996 } | |
1997 elsif ($j == $#tmp_p_residues) { | |
1998 if (defined $p_residues{$peptide}{$i}[$j]) { | |
1999 $foobar = $residues{$peptide}{$i}[$j]; | |
2000 if (defined $foobar) { | |
2001 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
2002 print OUT "$foobar"; | |
2003 print OUT "$tmp_site_for_printing\t"; | |
2004 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing"; | |
2005 $there_were_sites = 1; | |
2006 } | |
2007 } | |
2008 } | |
2009 } | |
2010 } | |
2011 } | |
2012 if (0 == $there_were_sites) { | |
2013 print OUT "\t"; | |
2014 } | |
2015 } | |
2016 } | |
2017 print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond; | |
2018 push (@ppep_metadata, $tmp_for_insert); | |
2019 | |
2020 # Column 6: UNIQUE phospho-motifs | |
2021 print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t"; | |
2022 push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}})); | |
2023 | |
2024 # Column 7: accessions | |
2025 if (defined $accessions{$peptide}) { | |
2026 print OUT join(" /// ", @{$accessions{$peptide}}), "\t"; | |
2027 push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}})); | |
2028 } else { | |
2029 print OUT "\t"; | |
2030 push (@ppep_metadata, ""); | |
2031 } | |
2032 | |
2033 # Column 8: ALL motifs with residue numbers | |
2034 if (defined $p_motifs{$peptide}) { | |
2035 print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t"; | |
2036 push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}})); | |
2037 } else { | |
2038 print OUT "\t"; | |
2039 push (@ppep_metadata, ""); | |
2040 } | |
2041 | |
2042 } | |
2043 | |
2044 # Print the PhosphoSite regulatory data | |
2045 | |
2046 if (defined $domain_2{$peptide}) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; } | |
2047 if (defined $ON_FUNCTION_2{$peptide}) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; } | |
2048 if (defined $ON_PROCESS_2{$peptide}) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; } | |
2049 if (defined $ON_PROT_INTERACT_2{$peptide}) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } | |
2050 if (defined $ON_OTHER_INTERACT_2{$peptide}) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } | |
2051 if (defined $notes_2{$peptide}) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; } | |
2052 | |
2053 if (defined $domain_2{$peptide}) { push (@ppep_metadata, $domain_2{$peptide}); } else { push(@ppep_metadata, ""); } | |
2054 if (defined $ON_FUNCTION_2{$peptide}) { push (@ppep_metadata, $ON_FUNCTION_2{$peptide}); } else { push(@ppep_metadata, ""); } | |
2055 if (defined $ON_PROCESS_2{$peptide}) { push (@ppep_metadata, $ON_PROCESS_2{$peptide}); } else { push(@ppep_metadata, ""); } | |
2056 if (defined $ON_PROT_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_PROT_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); } | |
2057 if (defined $ON_OTHER_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_OTHER_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); } | |
2058 if (defined $notes_2{$peptide}) { push (@ppep_metadata, $notes_2{$peptide}); } else { push(@ppep_metadata, ""); } | |
2059 | |
2060 # begin store-to-SQLite "ppep_metadata" table | |
2061 # --- | |
2062 for $i (1..14) { | |
2063 $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]); | |
2064 } | |
2065 if (not $ppep_metadata_stmth->execute()) { | |
2066 print "Error writing ppep_metadata row for phosphopeptide $ppep_metadata[$i]: $ppep_metadata_stmth->errstr\n"; | |
2067 } | |
2068 # ... | |
2069 # end store-to-SQLite "ppep_metadata" table | |
2070 | |
2071 # Print the data | |
2072 @tmp_data = (); | |
2073 foreach (@{$data{$peptide}}) { | |
2074 push(@tmp_data, $_); | |
2075 } | |
2076 print OUT join("\t", @tmp_data), "\t"; | |
2077 | |
2078 # begin store-to-SQLite "ppep_intensity" table | |
2079 # --- | |
2080 # commit the sample intensities | |
2081 $i = 0; | |
2082 foreach (@{$data{$peptide}}) { | |
2083 my $intense = $_; | |
2084 $ppep_intensity_stmth->bind_param( 1, $ppep_id ); | |
2085 $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} ); | |
2086 $ppep_intensity_stmth->bind_param( 3, $intense ); | |
2087 if (not $ppep_intensity_stmth->execute()) { | |
2088 print "Error writing tuple ($peptide,$samples[$i],$intense): $ppep_intensity_stmth->errstr\n"; | |
2089 } | |
2090 $i += 1; | |
2091 } | |
2092 # ... | |
2093 # end store-to-SQLite "ppep_intensity" table | |
2094 | |
2095 # print the kinase-substrate data | |
2096 for my $i (0 .. $#kinases_observed) { | |
2097 if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) { | |
2098 print OUT "X\t"; | |
2099 my $NetworKIN_label = $kinases_observed[$i]; #."_NetworKIN"; | |
2100 print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n"; | |
2101 # begin store-to-SQLite "ppep_gene_site" table | |
2102 # --- | |
2103 $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id | |
2104 $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names | |
2105 $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label); # ppep_gene_site.kinase_map | |
2106 $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id | |
2107 if (not $ppep_gene_site_stmth->execute()) { | |
2108 print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): $ppep_gene_site_stmth->errstr\n"; | |
2109 } | |
2110 # ... | |
2111 # end store-to-SQLite "ppep_gene_site" table | |
2112 } | |
2113 else { print OUT "\t";} | |
2114 } | |
2115 my %wrote_motif; | |
2116 my $motif_parts_0; | |
2117 my @motif_split; | |
2118 my $one_motif; | |
2119 | |
2120 for my $i (0 .. $#motif_type_keys) { | |
2121 if (exists($kinase_motif_matches{$peptide}{$motif_type_keys[$i]})) { | |
2122 print OUT "X\t"; | |
2123 #ACE-2022.06.20 $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i]; | |
2124 $motif_parts_0 = $motif_type{$motif_type_keys[$i]}; | |
2125 @motif_split = split("[|]", $motif_parts_0); | |
2126 #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0"; | |
2127 for my $j (0 .. $#motif_split) { | |
2128 $one_motif = $motif_split[$j]; | |
2129 #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0"; | |
2130 my $key = "$peptide\t$gene_names\t$one_motif"; | |
2131 if (!exists($wrote_motif{$key})) { | |
2132 $wrote_motif{$key} = $key; | |
2133 print MELT "$peptide\t$gene_names\t$motif_description[$i]\t$one_motif\n"; | |
2134 # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug | |
2135 # begin store-to-SQLite "ppep_gene_site" table | |
2136 # --- | |
2137 $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id | |
2138 $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names | |
2139 $ppep_gene_site_stmth->bind_param(3, $one_motif); # ppep_gene_site.kinase_map | |
2140 $ppep_gene_site_stmth->bind_param(4, $site_id{$motif_description[$i]}); # ppep_gene_site.site_type_id | |
2141 if (not $ppep_gene_site_stmth->execute()) { | |
2142 print "Error writing tuple ($peptide,$gene_names,$one_motif): $ppep_gene_site_stmth->errstr\n"; | |
2143 } | |
2144 # ... | |
2145 # end store-to-SQLite "ppep_gene_site" table | |
2146 } | |
2147 } | |
2148 } | |
2149 else { print OUT "\t";} | |
2150 } | |
2151 for my $i (0 .. $#kinases_PhosphoSite) { | |
2152 if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) { | |
2153 print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n"; | |
2154 if ($i < $#kinases_PhosphoSite) { | |
2155 print OUT "X\t"; | |
2156 } | |
2157 else { | |
2158 print OUT "X\n"; | |
2159 } | |
2160 # begin store-to-SQLite "ppep_gene_site" table | |
2161 # --- | |
2162 $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id | |
2163 $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names | |
2164 $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map | |
2165 $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE); # ppep_gene_site.site_type_id | |
2166 if (not $ppep_gene_site_stmth->execute()) { | |
2167 print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): $ppep_gene_site_stmth->errstr\n"; | |
2168 } | |
2169 # ... | |
2170 # end store-to-SQLite "ppep_gene_site" table | |
2171 } | |
2172 else { | |
2173 if ($i < $#kinases_PhosphoSite) { | |
2174 print OUT "\t"; | |
2175 } | |
2176 elsif ($i == $#kinases_PhosphoSite) { | |
2177 print OUT "\n"; | |
2178 } | |
2179 } | |
2180 } | |
2181 } | |
2182 | |
2183 close OUT; | |
2184 close MELT; | |
2185 $ppep_gene_site_stmth->finish; | |
2186 print "begin DB commit at " . format_localtime_iso8601() . "\n"; | |
2187 $dbh->{AutoCommit} = $auto_commit; | |
2188 $dbh->disconnect if ( defined $dbh ); | |
2189 | |
2190 print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n"; | |
2191 | |
2192 ############################################################################################################################### |