comparison PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 0:8dfd5d2b5903 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:54 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8dfd5d2b5903
1 #!/usr/local/bin/perl
2 ###############################################################################################################################
3 # perl Kinase_enrichment_analysis_complete_v0.pl
4 #
5 # Nick Graham, USC
6 # 2016-02-27
7 #
8 # Built from scripts written by NG at UCLA in Tom Graeber's lab:
9 # CombinePhosphoSites.pl
10 # Retrieve_p_motifs.pl
11 # NetworKIN_Motif_Finder_v7.pl
12 #
13 # Given a list of phospho-peptides, find protein information and upstream kinases.
14 # Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
15 #
16 # Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab:
17 # Added warnings and used strict;
18 # fixed some code paths resulting in more NetworKIN matches;
19 # applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow)
20 # to speed up "Match the non_p_peptides to the @sequences array";
21 # added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data;
22 # added support for SQLite output in addition to tabular files.
23 #
24 #
25 ###############################################################################################################################
26
27 use strict;
28 use warnings 'FATAL' => 'all';
29
30 use Getopt::Std;
31 use DBD::SQLite::Constants qw/:file_open/;
32 use DBI qw(:sql_types);
33 use File::Copy;
34 use File::Basename;
35 use POSIX qw(strftime);
36 use Time::HiRes qw(gettimeofday);
37 #use Data::Dump qw(dump);
38
39 my $USE_SEARCH_PPEP_PY = 1;
40 #my $FAILED_MATCH_SEQ = "Failed match";
41 my $FAILED_MATCH_SEQ = 'No Sequence';
42 my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name';
43
44 my $dirname = dirname(__FILE__);
45 my %opts;
46 my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type);
47 my $dbtype;
48 my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in);
49 my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n);
50 my $line = 0;
51 my @failed_match = ($FAILED_MATCH_SEQ);
52 my @failed_matches;
53 my (%all_data);
54 my (@p_peptides, @non_p_peptides);
55 my @parsed_fasta;
56 my (@accessions, @names, @sequences, @databases, $database);
57 my ($dbfile, $dbh, $stmth);
58 my @col_names;
59 my (%matched_sequences, %accessions, %names, %sites, );
60 my (@tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites);
61 my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);
62 my (@kinases_observed, $kinases);
63 my (@kinases_observed_lbl, @phosphosites_observed_lbl);
64 my ($p_sequence_kinase, $p_sequence, $kinase);
65 my (@motif_sequence, @motif_description, @motif_type_key_ary, %motif_type, %motif_count);
66 my (@kinases_PhosphoSite, $kinases_PhosphoSite);
67 my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);
68 my (%regulatory_sites_PhosphoSite_hash);
69 my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism);
70 my (%unique_motifs);
71 my ($kinase_substrate_NetworKIN_matches, $kinase_substrate_PhosphoSite_matches);
72 my %psp_regsite_protein_2;
73 my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2);
74 my @timeData;
75 my $PhosphoSitePlusCitation;
76 my (%site_description, %site_id);
77
78 my %kinase_substrate_NetworKIN_matches;
79 my %kinase_motif_matches;
80 my $regulatory_sites_PhosphoSite;
81 my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2);
82 my %kinase_substrate_PhosphoSite_matches;
83 my @formatted_sequence;
84 my $pSTY_sequence;
85 my $i;
86 my @a;
87 my $use_sqlite;
88 my $verbose;
89
90 ##########
91 ## opts ##
92 ##########
93 ## input files
94 # i : path to input file, e.g., 'outputfile_STEP2.txt'
95 # f : path to UniProtKB/SwissProt FASTA
96 # s : optional species argument
97 # n : path to NetworKIN_201612_cutoffscore2.0.txt
98 # m : path to pSTY_Motifs.txt
99 # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
100 # r : path to 2017-03_PSP_Regulatory_sites.txt
101 ## options
102 # P : phospho_type
103 # F : function
104 # v : verbose output
105 ## output files
106 # o : path to output file
107 # O : path to "melted" output file
108 # D : path to output SQLite file
109
110 sub usage()
111 {
112 print STDERR <<"EOH";
113 This program given a list of phospho-peptides, finds protein information and upstream kinases.
114 usage: $0 [-hvd] -f FASTA_file
115 -h : this (help) message
116 -v : slightly verbose
117 -a : use SQLite less
118 ## input files
119 -i : path to input file, e.g., 'outputfile_STEP2.txt'
120 -f : path to UniProtDB/SwissProt FASTA
121 -s : optional species filter argument for PSP records; defaults to 'human'
122 -n : path to NetworKIN_201612_cutoffscore2.0.txt
123 -m : path to pSTY_Motifs.txt
124 -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
125 -r : path to 2017-03_PSP_Regulatory_sites.txt
126 ## options
127 -P : phospho_type
128 -F : function
129 ## output files
130 -o : path to output file
131 -O : path to "melted" output file
132 -D : path to output SQLite file
133 example: $0
134 EOH
135 exit;
136 }
137
138 sub format_localtime_iso8601 {
139 # ref: https://perldoc.perl.org/Time::HiRes
140 my ($seconds, $microseconds) = gettimeofday;
141 # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html
142 return strftime("%Y-%m-%dT%H:%M:%S",localtime(time)) . sprintf(".%03d", $microseconds/1000);
143 }
144
145 sub replace_pSpTpY {
146 my ($formatted_sequence, $phospho_type) = @_;
147 if ($phospho_type eq 'y') {
148 $formatted_sequence =~ s/pS/S/g;
149 $formatted_sequence =~ s/pT/T/g;
150 $formatted_sequence =~ s/pY/y/g;
151 }
152 elsif ($phospho_type eq "sty") {
153 $formatted_sequence =~ s/pS/s/g;
154 $formatted_sequence =~ s/pT/t/g;
155 $formatted_sequence =~ s/pY/y/g;
156 }
157 $formatted_sequence;
158 }
159
160 sub pseudo_sed
161 {
162 # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV"
163 # Comments give the sed equivalent
164 my ($t) = @_;
165 my $s = $t;
166 # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; };
167 unless ($s =~ m / GN=/s)
168 {
169 $s =~ s :(OX=[^ \t]*):${1} GN=N/A:s;
170 }
171 # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; };
172 unless ($s =~ m / PE=/s)
173 {
174 $s =~ s :(GN=[^ \t]*):${1} PE=N/A:s;
175 }
176 # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; };
177 unless ($s =~ m / SV=/s)
178 {
179 $s =~ s :(PE=[^ \t]*):${1} SV=N/A:s;
180 }
181 # s/^sp.//;
182 $s =~ s :^...::s;
183 # s/[|]/\t/g;
184 $s =~ s :[|]:\t:sg;
185 if ( !($s =~ m/ OX=/s)
186 && !($s =~ m/ GN=/s)
187 && !($s =~ m/ PE=/s)
188 && !($s =~ m/ SV=/s)
189 ) {
190 # OS= is used elsewhere, but it's not helpful without OX and GN
191 $s =~ s/OS=/Species /g;
192 # supply sensible default values
193 $s .= "\tN/A\t-1\tN/A\tN/A\tN/A";
194 } else {
195 # s/ OS=/\t/;
196 if ($s =~ m/ OS=/s) { $s =~ s: OS=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
197 # s/ OX=/\t/;
198 if ($s =~ m/ OX=/s) { $s =~ s: OX=:\t:s; } else { $s =~ s:(.*)\t:$1\t-1\t:x; };
199 # s/ GN=/\t/;
200 if ($s =~ m/ GN=/s) { $s =~ s: GN=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
201 # s/ PE=/\t/;
202 if ($s =~ m/ PE=/s) { $s =~ s: PE=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
203 # s/ SV=/\t/;
204 if ($s =~ m/ SV=/s) { $s =~ s: SV=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
205 }
206 return $s;
207 } # sub pseudo_sed
208
209 getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ;
210
211
212 if (exists($opts{'h'})) {
213 usage();
214 }
215 if (exists($opts{'a'})) {
216 $USE_SEARCH_PPEP_PY = 0;
217 }
218 if (exists($opts{'v'})) {
219 $verbose = 1;
220 } else {
221 $verbose = 0;
222 }
223 if (!exists($opts{'i'}) || !-e $opts{'i'}) {
224 die('Input File not found');
225 } else {
226 $file_in = $opts{'i'};
227 }
228 if (!exists($opts{'f'}) || !-e $opts{'f'}) {
229 die('FASTA not found');
230 } else {
231 $fasta_in = $opts{'f'};
232 $use_sqlite = 0;
233 }
234 my $species;
235 if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) {
236 $species = 'human';
237 } else {
238 $species = $opts{'s'};
239 print "'-s' option is '$species'\n";
240 }
241 print "species filter is '$species'\n";
242
243 if (!exists($opts{'n'}) || !-e $opts{'n'}) {
244 die('Input NetworKIN File not found');
245 } else {
246 $networkin_in = $opts{'n'};
247 }
248 if (!exists($opts{'m'}) || !-e $opts{'m'}) {
249 die('Input pSTY_Motifs File not found');
250 } else {
251 $motifs_in = $opts{'m'};
252 }
253 if (!exists($opts{'p'}) || !-e $opts{'p'}) {
254 die('Input PSP_Kinase_Substrate_Dataset File not found');
255 } else {
256 $PSP_Kinase_Substrate_in = $opts{'p'};
257 }
258 if (!exists($opts{'r'}) || !-e $opts{'r'}) {
259 die('Input PSP_Regulatory_sites File not found');
260 } else {
261 $PSP_Regulatory_Sites_in = $opts{'r'};
262 }
263 if (exists($opts{'P'})) {
264 $phospho_type = $opts{'P'};
265 }
266 else {
267 $phospho_type = "sty";
268 }
269 if (exists($opts{'F'})) {
270 $average_or_sum = $opts{'F'};
271 }
272 else {
273 $average_or_sum = "sum";
274 }
275 if (exists($opts{'D'})) {
276 $db_out = $opts{'D'};
277 }
278 else {
279 $db_out = "db_out.sqlite";
280 }
281 if (exists($opts{'O'})) {
282 $file_melt = $opts{'O'};
283 }
284 else {
285 $file_melt = "output_melt.tsv";
286 }
287 if (exists($opts{'o'})) {
288 $file_out = $opts{'o'};
289 }
290 else {
291 $file_out = "output.tsv";
292 }
293
294
295 ###############################################################################################################################
296 # Print the relevant file names to the screen
297 ###############################################################################################################################
298 # print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n";
299 print "\n--- parameters:\n";
300 print "Data file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\nMelted map: $file_melt\n";
301 if ($use_sqlite == 0) {
302 print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file: $fasta_in\nOutput SQLite file: $db_out\n";
303 } else {
304 print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file: $dbfile\nOutput SQLite file: $db_out\n";
305 }
306 print "...\n\n";
307
308 print "Phospho-residues(s) = $phospho_type\n\n";
309 if ($phospho_type ne 'y') {
310 if ($phospho_type ne 'sty') {
311 die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
312 }
313 }
314
315 ###############################################################################################################################
316 # read the input data file
317 # average or sum identical phospho-sites, depending on the value of $average_or_sum
318 ###############################################################################################################################
319
320 open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n";
321
322 die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;
323
324
325 $line = 0;
326
327 while (<IN>) {
328 chomp;
329 my @x = split(/\t/);
330 for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;}
331
332 # Read in the samples
333 if ($line == 0) {
334 for my $n (1 .. $#x) {
335 push (@samples, $x[$n]);
336 $sample_id_lut{$x[$n]} = $n;
337 }
338 $line++;
339 } else {
340 # check whether we have already seen a phospho-peptide
341 if (exists($data{$x[0]})) {
342 if ($average_or_sum eq "sum") { # add the data
343 # unload the data
344 @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); }
345 # add the new data and repack
346 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; }
347 $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); }
348
349 } elsif ($average_or_sum eq "average") { # average the data
350 # unload the data
351 @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); }
352 # average with the new data and repack
353 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); }
354 $n{$x[0]}++;
355 $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); }
356 }
357 }
358 # if the phospho-sequence has not been seen, save the data
359 else {
360 for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); }
361 $n{$x[0]} = 1;
362 }
363 }
364 }
365 close(IN);
366
367
368 ###############################################################################################################################
369 # Search the FASTA database for phospho-sites and motifs
370 #
371 # based on Retrieve_p_peptide_motifs_v2.pl
372 ###############################################################################################################################
373
374
375 ###############################################################################################################################
376 #
377 # Read in the Data file:
378 # 1) make @p_peptides array as in the original file
379 # 2) make @non_p_peptides array w/o residue modifications (p, #, other)
380 #
381 ###############################################################################################################################
382
383 foreach my $peptide (keys %data) {
384 $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g;
385 push (@p_peptides, $peptide);
386 $peptide =~ s/p//g;
387 push(@non_p_peptides, $peptide);
388 }
389
390 if ($use_sqlite == 0) {
391 ###############################################################################################################################
392 #
393 # Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database
394 #
395 ###############################################################################################################################
396
397 # e.g.
398 # >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
399 # MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD
400 # DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK
401 # EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH
402 # QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS
403 # EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
404 # accession: Q9Y3B9
405 # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
406 # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
407 #
408 # e.g.
409 # >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
410 # >gi|52001466|sp|P00366.2|DHE3_BOVIN Glutamate dehydrogenase 1, mitochondrial precursor (GDH) cRAP
411 #
412 # e.g.
413 # >zs|P00009.24.AR-V2_1.zs|zs_peptide_0024_AR-V2_1
414
415
416 open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n";
417 print "Reading FASTA file $fasta_in\n";
418 # ref: https://perldoc.perl.org/perlsyn#Compound-Statements
419 # "If the condition expression of a while statement is based on any of
420 # a group of iterative expression types then it gets some magic treatment.
421 # The affected iterative expression types are readline, the <FILEHANDLE>
422 # input operator, readdir, glob, the <PATTERN> globbing operator, and
423 # `each`. If the condition expression is one of these expression types,
424 # then the value yielded by the iterative operator will be implicitly
425 # assigned to `$_`."
426 while (<IN1>) {
427 chomp;
428 # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR
429 # "If only PATTERN is given, EXPR defaults to $_."
430 my (@x) = split(/\|/);
431 # begin FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
432 if (@x > 3) {
433 @x = (">".$x[$#x - 2], $x[$#x - 1], $x[$#x]);
434 }
435 # end FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
436 for my $i (0 .. $#x) {
437 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; }
438 # Use of uninitialized value $x[0] in pattern match (m//) at /home/rstudio/src/mqppep/tools/mqppep/PhosphoPeptide_Upstream_Kinase_Mapping.pl line 411, <IN1> line 3.
439 if (exists($x[0])) {
440 if ($x[0] =~ /^>/) {
441 # parsing header line
442 $x[0] =~ s/\>//g;
443 push (@databases, $x[0]);
444 push (@accessions, $x[1]);
445 push (@names, $x[2]);
446 # format tags of standard UniProtKB headers as tab-separated values
447 # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV"
448 $_ = pseudo_sed(join "\t", (">".$x[0], $x[1], $x[2]));
449 # append tab as separator between header and sequence
450 s/$/\t/;
451 # parsed_fasta gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
452 print "push (\@parsed_fasta, $_)\n" if (0 && $x[0] ne "zs");
453 push (@parsed_fasta, $_);
454 } elsif ($x[0] =~ /^\w/) {
455 # line is a portion of the sequence
456 if (defined $sequences[$#accessions]) {
457 $sequences[$#accessions] = $sequences[$#accessions].$x[0];
458 } else {
459 $sequences[$#accessions] = $x[0];
460 }
461 $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0];
462 }
463 }
464 }
465 close IN1;
466 print "Done Reading FASTA file $fasta_in\n";
467 $dbfile = $db_out;
468 print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n";
469 $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
470 my $auto_commit = $dbh->{AutoCommit};
471 print "auto_commit was $auto_commit and is now 0\n" if ($verbose);
472 $dbh->{AutoCommit} = 0;
473
474 # begin DDL-to-SQLite
475 # ---
476 $stmth = $dbh->prepare("
477 DROP TABLE IF EXISTS UniProtKB;
478 ");
479 $stmth->execute();
480
481 $stmth = $dbh->prepare("
482 CREATE TABLE UniProtKB (
483 Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE,
484 Description TEXT,
485 Organism_Name TEXT,
486 Organism_ID INTEGER,
487 Gene_Name TEXT,
488 PE TEXT,
489 SV TEXT,
490 Sequence TEXT,
491 Database TEXT
492 )
493 ");
494 $stmth->execute();
495 $stmth = $dbh->prepare("
496 CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID);
497 ");
498 $stmth->execute();
499 $stmth = $dbh->prepare("
500 CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name);
501 ");
502 $stmth->execute();
503 # ...
504 # end DDL-to-SQLite
505
506 # insert all rows
507 # begin store-to-SQLite "UniProtKB" table
508 # ---
509 $stmth = $dbh->prepare("
510 INSERT INTO UniProtKB (
511 Uniprot_ID,
512 Description,
513 Organism_Name,
514 Organism_ID,
515 Gene_Name,
516 PE,
517 SV,
518 Sequence,
519 Database
520 ) VALUES (?,?,?,?,?,?,?,?,?)
521 ");
522 my $row_count = 1;
523 my $row_string;
524 my (@row, @rows);
525 my $wrd;
526 while ( scalar @parsed_fasta > 0 ) {
527 $database = $databases[$#parsed_fasta];
528 # row_string gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
529 # 1 2 3 4 5 6 7 sequence database
530 $row_string = pop(@parsed_fasta);
531 @row = (split /\t/, $row_string);
532 if ((not exists($row[4])) || ($row[4] eq "")) {
533 die("invalid fasta line\n$row_string\n");
534 };
535 if ($row[4] eq "N/A") {
536 print "Organism_ID is 'N/A' for row $row_count:\n'$row_string'\n";
537 $row[4] = -1;
538 };
539 for $i (1..3,5..8) {
540 #BIND print "bind_param $i, $row[$i]\n";
541 $stmth->bind_param($i, $row[$i]);
542 }
543 #BIND print "bind_param 4, $row[4]\n";
544 $stmth->bind_param(9, $database);
545 #BIND print "bind_param 4, $row[4]\n";
546 $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER });
547 if (not $stmth->execute()) {
548 print "Error in row $row_count: " . $dbh->errstr . "\n";
549 print "Row $row_count: $row_string\n";
550 print "Row $row_count: " . ($row_string =~ s/\t/@/g) . "\n";
551 }
552 if (0 && $database ne "zs") {
553 print "row_count: $row_count\n";
554 #### print "row_string: $row_string\n";
555 print "Row $row_count: $row_string\n";
556 for $i (1..3,5..8) {
557 print "bind_param $i, $row[$i]\n" if (exists($row[$i]));
558 }
559 print "bind_param 4, $row[4]\n" if (exists($row[4]));
560 print "bind_param 9, $database\n";
561 };
562 $row_count += 1;
563 }
564 # ...
565 # end store-to-SQLite "UniProtKB" table
566
567 print "begin commit at " . format_localtime_iso8601() . "\n";
568 $dbh->{AutoCommit} = $auto_commit;
569 print "auto_commit is now $auto_commit\n" if ($verbose);
570 $dbh->disconnect if ( defined $dbh );
571 print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n";
572 $dbtype = "FASTA";
573 }
574
575 if ($use_sqlite == 1) {
576 ###############################################################################################################################
577 #
578 # Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array
579 #
580 ###############################################################################################################################
581
582 copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!";
583
584 # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
585 $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, {
586 sqlite_open_flags => SQLITE_OPEN_READONLY,
587 });
588 print "DB connection $dbh is to $dbfile\n";
589
590 # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence
591 $stmth = $dbh->prepare("
592 SELECT Uniprot_ID
593 , Description
594 || CASE WHEN Organism_Name = 'N/A' THEN '' ELSE ' OS=' || Organism_Name END
595 || CASE WHEN Organism_ID = -1 THEN '' ELSE ' OX=' || Organism_ID END
596 || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN=' || Gene_Name END
597 || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE=' || PE END
598 || CASE WHEN SV = 'N/A' THEN '' ELSE ' SV=' || SV END
599 AS Description
600 , Sequence
601 , Database
602 FROM
603 UniProtKB
604 ");
605 $stmth->execute();
606 @col_names = @{$stmth->{NAME}};
607 print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
608 while (my @row = $stmth->fetchrow_array) {
609 push (@names, $row[1]); # redacted Description
610 push (@accessions, $row[0]); # Uniprot_ID
611 $sequences[$#accessions] = $row[2]; # Sequence
612 push (@databases, $row[3]); # Database (should be 'sp')
613 }
614
615 $dbh->disconnect if ( defined $dbh );
616
617 print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
618 $dbtype = "SQLite";
619 }
620
621 print "$#accessions accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";
622
623 ######################
624 $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
625 $stmth = $dbh->prepare("
626 INSERT INTO UniProtKB (
627 Uniprot_ID,
628 Description,
629 Organism_Name,
630 Organism_ID,
631 Gene_Name,
632 PE,
633 SV,
634 Sequence,
635 Database
636 ) VALUES (
637 'No Uniprot_ID',
638 'NO_GENE_SYMBOL No Description',
639 'No Organism_Name',
640 0,
641 '$FAILED_MATCH_GENE_NAME',
642 '0',
643 '0',
644 '$FAILED_MATCH_SEQ',
645 'No Database'
646 )
647 ");
648 if (not $stmth->execute()) {
649 print "Error inserting dummy row into UniProtKB: $stmth->errstr\n";
650 }
651 $dbh->disconnect if ( defined $dbh );
652 ######################
653
654 @timeData = localtime(time);
655 print "\n--- Start search at " . format_localtime_iso8601() ."\n";
656
657 print " --> Calling 'search_ppep' script\n\n";
658 if ($verbose) {
659 $i = system("python $dirname/search_ppep.py -u $db_out -p $file_in --verbose");
660 } else {
661 $i = system("python $dirname/search_ppep.py -u $db_out -p $file_in");
662 }
663 if ($i) {
664 print "python $dirname/search_ppep.py -u $db_out -p $file_in\n exited with exit code $i\n";
665 die "Search failed for phosphopeptides in SwissProt/SQLite file.";
666 }
667 print " <-- Returned from 'search_ppep' script\n";
668
669 @timeData = localtime(time);
670 print "... Finished search at " . format_localtime_iso8601() ."\n\n";
671
672
673 ###############################################################################################################################
674 #
675 # Match the non_p_peptides to the @sequences array:
676 # 1) Format the motifs +/- 10 residues around the phospho-site
677 # 2) Print the original data plus the phospho-motif to the output file
678 #
679 ###############################################################################################################################
680
681
682 print "--- Match the non_p_peptides to the \@sequences array:\n";
683
684 if ($USE_SEARCH_PPEP_PY) {
685 print "Find the matching protein sequence(s) for the peptide using SQLite\n";
686 } else {
687 print "Find the matching protein sequence(s) for the peptide using slow search\n";
688 }
689
690 # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
691 $dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, {
692 sqlite_open_flags => SQLITE_OPEN_READONLY,
693 });
694 print "DB connection $dbh is to $db_out\n";
695
696 # CREATE VIEW uniprotid_pep_ppep AS
697 # SELECT deppep_UniProtKB.UniprotKB_ID AS accession
698 # , deppep.seq AS peptide
699 # , ppep.seq AS phosphopeptide
700 # , UniProtKB.Sequence AS sequence
701 # , UniProtKB.Description AS description
702 # FROM ppep, deppep, deppep_UniProtKB, UniProtKB
703 # WHERE deppep.id = ppep.deppep_id
704 # AND deppep.id = deppep_UniProtKB.deppep_id
705 # AND deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
706 # ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;
707
708 my %ppep_to_count_lut;
709 print "start select peptide counts " . format_localtime_iso8601() . "\n";
710 my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
711 SELECT DISTINCT
712 phosphopeptide
713 , count(*) as i
714 FROM
715 uniprotkb_pep_ppep_view
716 GROUP BY
717 phosphopeptide
718 ORDER BY
719 phosphopeptide
720 ");
721 if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
722 die "Error fetching peptide counts: $uniprotkb_pep_ppep_view_stmth->errstr\n";
723 }
724 while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
725 $ppep_to_count_lut{$row[0]} = $row[1];
726 #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
727 }
728
729 # accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id
730 # 0 1 2 3 4 5 6 7 8 9
731 my $COL_ACCESSION = 0;
732 my $COL_PEPTIDE = 1;
733 my $COL_SEQUENCE = 2;
734 my $COL_DESCRIPTION = 3;
735 my $COL_PHOSPHOPEPTIDE = 4;
736 my $COL_LONG_DESCRIPTION = 5;
737 my $COL_POS_START = 6;
738 my $COL_POS_END = 7;
739 my $COL_SCRUBBED = 8;
740 my $COL_PPEP_ID = 9;
741
742 my %ppep_to_row_lut;
743 print "start select all records without qualification " . format_localtime_iso8601() . "\n";
744 $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
745 SELECT DISTINCT
746 accession
747 , peptide
748 , sequence
749 , description
750 , phosphopeptide
751 , long_description
752 , pos_start
753 , pos_end
754 , scrubbed
755 , ppep_id
756 FROM
757 uniprotkb_pep_ppep_view
758 ORDER BY
759 phosphopeptide
760 ");
761 if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
762 die "Error fetching all records without qualification: $uniprotkb_pep_ppep_view_stmth->errstr\n";
763 }
764 my $current_ppep;
765 my $counter = 0;
766 my $former_ppep = "";
767 @tmp_matches = ();
768 @tmp_accessions = ();
769 @tmp_names = ();
770 @tmp_sites = ();
771 while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
772 # Identify phosphopeptide for current row;
773 # it is an error for it to change when the counter is not zero.
774 $current_ppep = $row[$COL_PHOSPHOPEPTIDE];
775
776 # when counter is zero, prepare for a new phosphopeptide
777 if (not $current_ppep eq $former_ppep) {
778 die "counter is $counter instead of zero" if ($counter != 0);
779 $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID];
780 print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose);
781 $counter = $ppep_to_count_lut{$current_ppep};
782 @tmp_matches = ();
783 @tmp_accessions = ();
784 @tmp_names = ();
785 @tmp_sites = ();
786 }
787
788 if ($USE_SEARCH_PPEP_PY) {
789 push(@tmp_matches, $row[ $COL_SEQUENCE ]);
790 push(@tmp_accessions, $row[ $COL_ACCESSION ]);
791 push(@tmp_names, $row[ $COL_LONG_DESCRIPTION ]);
792 push(@tmp_sites, $row[ $COL_POS_START ]);
793 }
794
795 # Prepare counter and phosphopeptide tracker for next row
796 $former_ppep = $current_ppep;
797 $counter -= 1;
798
799 # Set trackers for later use after last instance of current phosphopeptide
800 if ($counter == 0) {
801 if ($USE_SEARCH_PPEP_PY) {
802 $matched_sequences{$current_ppep} = [ @tmp_matches ];
803 $accessions{ $current_ppep} = [ @tmp_accessions ];
804 $names{ $current_ppep} = [ @tmp_names ];
805 $sites{ $current_ppep} = [ @tmp_sites ];
806 }
807 }
808 }
809
810
811 print "end select all records without qualification " . format_localtime_iso8601() . "\n";
812
813 for my $j (0 .. $#p_peptides) {
814
815 #Find the matching protein sequence(s) for the peptide using SQLite
816 my ($site, $sequence);
817 my (@row, @rows);
818 my $match = 0;
819 my $p_peptide = $p_peptides[$j];
820 @tmp_matches = ();
821 @tmp_accessions = ();
822 @tmp_names = ();
823 @tmp_sites = ();
824
825 #Find the matching protein sequence(s) for the peptide using slow search
826 $site = -1;
827 unless ($USE_SEARCH_PPEP_PY) {
828 for my $k (0 .. $#sequences) {
829 $site = index($sequences[$k], $non_p_peptides[$j]);
830 if ($site != -1) {
831 push(@tmp_matches, $sequences[$k]);
832 push(@tmp_accessions, $accessions[$k]);
833 push(@tmp_names, $names[$k]);
834 push(@tmp_sites, $site);
835 }
836 # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n";
837 $site = -1; $match++;
838 # print "tmp_accessions @tmp_accessions \n";
839 }
840 if ($match == 0) { # Check to see if no match was found. Skip to next if no match found.
841 print "Warning: Failed match for $p_peptides[$j]\n";
842 $matched_sequences{$p_peptides[$j]} = \@failed_match;
843 push(@failed_matches,$p_peptides[$j]);
844 next;
845 } else {
846 $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
847 $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
848 $names{$p_peptides[$j]} = [ @tmp_names ];
849 $sites{$p_peptides[$j]} = [ @tmp_sites ];
850 }
851 }
852
853 } # end for my $j (0 .. $#p_peptides)
854
855 print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";
856
857 print "--- Match the p_peptides to the \@sequences array:\n";
858
859 for my $peptide_to_match ( keys %matched_sequences ) {
860 if (grep($peptide_to_match, @failed_matches)) {
861 print "Failed to match peptide $peptide_to_match\n";
862 }
863 next if (grep($peptide_to_match, @failed_matches));
864 my @matches = @{$matched_sequences{$peptide_to_match}};
865 @tmp_motifs_array = ();
866 for my $i (0 .. $#matches) {
867
868 # Find the location of the phospo-site in the sequence(s)
869 $tmp_site = 0; my $offset = 0;
870 my $tmp_p_peptide = $peptide_to_match;
871 $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;
872
873 # Find all phosphorylated residues in the p_peptide
874 @p_sites = ();
875 while ($tmp_site != -1) {
876 $tmp_site = index($tmp_p_peptide, 'p', $offset);
877 if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
878 $offset = $tmp_site + 1;
879 $tmp_p_peptide =~ s/p//;
880 }
881 @tmp_p_residues = ();
882 for my $l (0 .. $#p_sites) {
883 next if not defined $sites{$peptide_to_match}[$i];
884
885 push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]);
886
887 # Match the sequences around the phospho residues to find the motifs
888 my ($desired_residues_L, $desired_residues_R);
889 if ($tmp_p_residues[0] - 10 < 0) { #check to see if there are fewer than 10 residues left of the first p-site
890 # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
891 $desired_residues_L = $tmp_p_residues[0];
892 }
893 else {
894 $desired_residues_L = 10;
895 }
896 my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
897 if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) { #check to see if there are fewer than 10 residues right of the last p-site
898 $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
899 # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
900 # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
901 }
902 else {
903 $desired_residues_R = 10;
904 }
905
906 my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
907 my $arg2 = $tmp_p_residues[0] - $desired_residues_L;
908 my $arg1 = $matched_sequences{$peptide_to_match}[$i];
909
910 if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) {
911 $tmp_motif = substr($arg1, $arg2, $total_length);
912
913 # Put the "p" back in front of the appropriate phospho-residue(s).
914 my (@tmp_residues, $tmp_position);
915 for my $m (0 .. $#p_sites) {
916 # print "Line 183: $p_sites[$m]\n";
917 if ($m == 0) {
918 $tmp_position = $desired_residues_L;
919 } else {
920 $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];
921 }
922 if ($tmp_position < length($tmp_motif) + 1) {
923 push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
924 if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
925 if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
926 if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
927 }
928 }
929
930 $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;
931
932 # Comment out on 8.10.13 to remove the numbers from motifs
933 my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
934 my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
935 $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
936 push(@tmp_motifs_array, $tmp_motif);
937 $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
938 $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
939 }
940 }
941 $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
942 } # end for my $i (0 .. $#matches) ### this bracket could be in the wrong place
943 }
944
945 print "... Finished match the p_peptides to the \@sequences array at " . format_localtime_iso8601() ."\n\n";
946
947 ###############################################################################################################################
948 #
949 # Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
950 #
951 ###############################################################################################################################
952
953
954 print "--- Reading various site data:\n";
955
956 ###############################################################################################################################
957 #
958 # Read the NetworKIN_predictions file:
959 # 1) make a "kinases_observed" array
960 # 2) annotate the phospho-substrates with the appropriate kinase
961 #
962 ###############################################################################################################################
963 my $SITE_KINASE_SUBSTRATE = 1;
964 $site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN";
965
966 open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n";
967 print "Reading the NetworKIN data: $networkin_in\n";
968 while (<IN1>) {
969 chomp;
970 my (@x) = split(/\t/);
971 for my $i (0 .. $#x) {
972 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
973 }
974 next if ($x[0] eq "#substrate");
975 if (exists ($kinases -> {$x[2]})) {
976 #do nothing
977 }
978 else {
979 $kinases -> {$x[2]} = $x[2];
980 push (@kinases_observed, $x[2]);
981 }
982 my $tmp = $x[10]."_".$x[2]; #eg, REEILsEMKKV_PKCalpha
983 if (exists($p_sequence_kinase -> {$tmp})) {
984 #do nothing
985 }
986 else {
987 $p_sequence_kinase -> {$tmp} = $tmp;
988 }
989 }
990 close IN1;
991
992 ###############################################################################################################################
993 #
994 # Read the Kinase motifs file:
995 # 1) make a "motif_sequence" array
996 #
997 ###############################################################################################################################
998
999 # file format (tab separated):
1000 # x[0] = quasi-primary key (character), e.g., '17' or '23a'
1001 # x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)'
1002 # x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)'
1003 # "counter" "pcre" "symbol" "description" "pubmed_id" "classification" "source"
1004 # "1" "R.R..(pS|pT)(F|L)" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8985174" "kinase substrate" "HPRD"
1005 # x[3] = old description, i.e., description in Amanchy (HPRD) and Phosida tables
1006 # x[4] = pubmed id
1007 # x[5] = classification
1008 # x[6] = source (Phosida or HPRD)
1009 my $SITE_HPRD = 2;
1010 $site_description{$SITE_HPRD} = "HPRD";
1011 $site_id{$site_description{$SITE_HPRD}} = $SITE_HPRD;
1012 my $SITE_PHOSIDA = 4;
1013 $site_description{$SITE_PHOSIDA} = "Phosida";
1014 $site_id{$site_description{$SITE_PHOSIDA}} = $SITE_PHOSIDA;
1015
1016 open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n";
1017 print "Reading the Motifs file: $motifs_in\n";
1018
1019 while (<IN2>) {
1020 chomp;
1021 my (@x) = split(/\t/);
1022 my $tmp_motif_description;
1023 if ($#x == 6) { # weirdly, a @list of length seven has $#list == 6
1024 # remove double-quotes which are helpful or necessary for Excel
1025 $x[6] =~ s/\"//g;
1026 $tmp_motif_description = $x[6];
1027 } else {
1028 $tmp_motif_description = "motif";
1029 }
1030 for my $i (0 .. 2) {
1031 # remove any embedded CR or LF (none should exist)
1032 $x[$i] =~ s/\r//g;
1033 $x[$i] =~ s/\n//g;
1034 # remove double-quotes which are helpful or necessary for Excel
1035 $x[$i] =~ s/\"//g;
1036 }
1037 if (exists ($motif_type{$x[2]})) {
1038 #ACE-2022.06.20 $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2];
1039 $motif_type{$x[2]} = $motif_type{$x[2]}."|".$x[2];
1040 } else {
1041 $motif_type{$x[2]} = $x[2];
1042 $motif_count{$x[1]} = 0;
1043 push (@motif_sequence, $x[1]);
1044 push (@motif_description, $tmp_motif_description);
1045 push (@motif_type_key_ary, $x[2])
1046 }
1047 }
1048 close (IN2);
1049
1050
1051 ###############################################################################################################################
1052 # 6.28.2011
1053 # Read PSP_Kinase_Substrate data:
1054 # 1) make a "kinases_PhosphoSite" array
1055 # 2) annotate the phospho-substrates with the appropriate kinase
1056 #
1057 # Columns:
1058 # (0) GENE
1059 # (1) KINASE
1060 # (2) KIN_ACC_ID
1061 # (3) KIN_ORGANISM
1062 # (4) SUBSTRATE
1063 # (5) SUB_GENE_ID
1064 # (6) SUB_ACC_ID
1065 # (7) SUB_GENE
1066 # (8) SUB_ORGANISM
1067 # (9) SUB_MOD_RSD
1068 # (10) SITE_GRP_ID
1069 # (11) SITE_+/-7_AA
1070 # (12) DOMAIN
1071 # (13) IN_VIVO_RXN
1072 # (14) IN_VITRO_RXN
1073 # (15) CST_CAT#
1074 ###############################################################################################################################
1075
1076 my $SITE_PHOSPHOSITE = 3;
1077 $site_description{$SITE_PHOSPHOSITE} = "PhosphoSite";
1078
1079
1080 $line = 0;
1081
1082 open (IN3, "$PSP_Kinase_Substrate_in") or die "I couldn't find $PSP_Kinase_Substrate_in\n";
1083 print "Reading the PhosphoSite Kinase-Substrate data: $PSP_Kinase_Substrate_in\n";
1084
1085 while (<IN3>) {
1086 chomp;
1087 my (@x) = split(/\t/);
1088 for my $i (0 .. $#x) {
1089 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
1090 }
1091 if ($line != 0) {
1092 if (($species eq $x[3]) && ($species eq $x[8])) {
1093 if (exists ($kinases_PhosphoSite -> {$x[0]})) {
1094 #do nothing
1095 }
1096 else {
1097 $kinases_PhosphoSite -> {$x[0]} = $x[0];
1098 push (@kinases_PhosphoSite, $x[0]);
1099 }
1100 my $offset = 0;
1101 # Replace the superfluous lower case s, t and y
1102 my @lowercase = ('s','t','y');
1103 my @uppercase = ('S','T','Y');
1104 for my $k (0 .. 2) {
1105 my $site = 0;
1106 while ($site != -1) {
1107 $site = index($x[11],$lowercase[$k], $offset);
1108 if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);}
1109 $offset = $site + 1;
1110 }
1111 }
1112 my $tmp = $x[11]."_".$x[0]; #eg, RTPGRPLsSYGMDSR_PAK2
1113 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
1114 #do nothing
1115 }
1116 else {
1117 $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
1118 }
1119 }
1120 else {
1121 # do nothing
1122 #print "PSP_kinase_substrate line rejected because KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]': $line\n";
1123 }
1124 }
1125 $line++;
1126 }
1127 close IN3;
1128
1129
1130 ###############################################################################################################################
1131 # Read PhosphoSite regulatory site data:
1132 # 1) make a "regulatory_sites_PhosphoSite" hash
1133 #
1134 # Columns:
1135 # (0) GENE
1136 # (2) PROT_TYPE
1137 # (3) ACC_ID
1138 # (4) GENE_ID
1139 # (5) HU_CHR_LOC
1140 # (6) ORGANISM --> %organism
1141 # (7) MOD_RSD
1142 # (8) SITE_GRP_ID
1143 # (9) SITE_+/-7_AA --> %regulatory_sites_PhosphoSite_hash
1144 # (10) DOMAIN --> %domain
1145 # (11) ON_FUNCTION --> %ON_FUNCTION
1146 # (12) ON_PROCESS --> %ON_PROCESS
1147 # (13) ON_PROT_INTERACT --> %ON_PROT_INTERACT
1148 # (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT
1149 # (15) PMIDs
1150 # (16) LT_LIT
1151 # (17) MS_LIT
1152 # (18) MS_CST
1153 # (19) NOTES --> %notes
1154 ###############################################################################################################################
1155
1156
1157 $dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
1158 my $auto_commit = $dbh->{AutoCommit};
1159 $dbh->{AutoCommit} = 0;
1160 print "DB connection $dbh is to $db_out, opened for modification\n";
1161
1162 # add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
1163 $stmth = $dbh->prepare("
1164 CREATE TABLE IF NOT EXISTS PSP_Regulatory_site (
1165 SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
1166 DOMAIN TEXT,
1167 ON_FUNCTION TEXT,
1168 ON_PROCESS TEXT,
1169 ON_PROT_INTERACT TEXT,
1170 ON_OTHER_INTERACT TEXT,
1171 NOTES TEXT,
1172 ORGANISM TEXT,
1173 PROTEIN TEXT
1174 )
1175 ");
1176 $stmth->execute();
1177
1178 # add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite
1179 $stmth = $dbh->prepare("
1180 CREATE TABLE IF NOT EXISTS ppep_regsite_LUT
1181 ( ppep_id INTEGER REFERENCES ppep(id)
1182 , site_plusminus_7AA TEXT REFERENCES PSP_Regulatory_site(site_plusminus_7AA)
1183 , PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE
1184 );
1185 ");
1186 $stmth->execute();
1187
1188 # $stmth = $dbh->prepare("
1189 # CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0
1190 # ON PSP_Regulatory_site(site_plusminus_7AA);
1191 # ");
1192 # $stmth->execute();
1193
1194
1195 # add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
1196 my $citation_sql;
1197 $citation_sql = "
1198 CREATE TABLE IF NOT EXISTS Citation (
1199 ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE,
1200 CitationData TEXT,
1201 PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE
1202 )
1203 ";
1204 $stmth = $dbh->prepare($citation_sql);
1205 $stmth->execute();
1206
1207
1208 open (IN4, "$PSP_Regulatory_Sites_in") or die "I couldn't find $PSP_Regulatory_Sites_in\n";
1209 print "Reading the PhosphoSite regulatory site data: $PSP_Regulatory_Sites_in\n";
1210
1211
1212 $line = -1;
1213 while (<IN4>) {
1214 $line++;
1215 chomp;
1216 if ($_ =~ m/PhosphoSitePlus/) {
1217 #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g);
1218 $PhosphoSitePlusCitation = $_;
1219 $PhosphoSitePlusCitation =~ s/\t//g;
1220 $PhosphoSitePlusCitation =~ s/\r//g;
1221 $PhosphoSitePlusCitation =~ s/\n//g;
1222 $PhosphoSitePlusCitation =~ s/""/"/g;
1223 $PhosphoSitePlusCitation =~ s/^"//g;
1224 $PhosphoSitePlusCitation =~ s/"$//g;
1225 print "$PhosphoSitePlusCitation\n";
1226 next;
1227 }
1228 my (@x) = split(/\t/);
1229 for my $i (0 .. $#x) {
1230 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
1231 }
1232 my $found_GENE=0;
1233 if ( (not exists($x[0])) ) {
1234 next;
1235 }
1236 elsif ( ($x[0] eq "GENE") ) {
1237 $found_GENE=1;
1238 next;
1239 }
1240 if ( (not exists($x[9])) || ($x[9] eq "") ) {
1241 if (exists($x[8]) && (not $x[8] eq "")) {
1242 die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
1243 } else {
1244 if ( (not exists($x[1])) || (not $x[1] eq "") ) {
1245 print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
1246 if $found_GENE==1;
1247 }
1248 next;
1249 }
1250 }
1251 elsif ($line != 0) {
1252 if ($species ne $x[6]) {
1253 # Do nothing - this record was filtered out by the species filter
1254 }
1255 elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) {
1256 if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
1257 $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9];
1258 $domain{$x[9]} = $x[10];
1259 $ON_FUNCTION{$x[9]} = $x[11];
1260 $ON_PROCESS{$x[9]} = $x[12];
1261 $ON_PROT_INTERACT{$x[9]} = $x[13];
1262 $ON_OTHER_INTERACT{$x[9]} = $x[14];
1263 $notes{$x[9]} = $x[19];
1264 $organism{$x[9]} = $x[6];
1265 }
1266 }
1267 else {
1268 # $domain
1269 if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
1270 if ($x[10] ne "") {
1271 $domain{$x[9]} = $domain{$x[10]};
1272 }
1273 else {
1274 # do nothing
1275 }
1276 }
1277 else {
1278 if ($domain{$x[9]} =~ /$x[10]/) {
1279 # do nothing
1280 }
1281 else {
1282 $domain{$x[9]} = $domain{$x[9]}." / ".$x[10];
1283 #print "INFO line $line - compound domain for 7aa: GENE $x[0] PROTEIN $x[1] PROT_TYPE $x[2] ACC_ID $x[3] GENE_ID $x[4] HU_CHR_LOC $x[5] ORGANISM $x[6] MOD_RSD $x[7] SITE_GRP_ID $x[8] SITE_+/-7_AA $x[9] DOMAIN $domain{$x[9]}\n";
1284 }
1285 }
1286
1287 # $ON_FUNCTION
1288 if (!defined $ON_FUNCTION{$x[9]} || $ON_FUNCTION{$x[9]} eq "") {
1289 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]};
1290 } elsif ($x[10] eq "") {
1291 # do nothing
1292 }
1293 else {
1294 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10];
1295 }
1296
1297 # $ON_PROCESS
1298 if (!defined $ON_PROCESS{$x[9]} || $ON_PROCESS{$x[9]} eq "") {
1299 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]};
1300 } elsif ($x[10] eq "") {
1301 # do nothing
1302 }
1303 else {
1304 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10];
1305 }
1306
1307 # $ON_PROT_INTERACT
1308 if (!defined $ON_PROT_INTERACT{$x[9]} || $ON_PROT_INTERACT{$x[9]} eq "") {
1309 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]};
1310 } elsif ($x[10] eq "") {
1311 # do nothing
1312 }
1313 else {
1314 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10];
1315 }
1316
1317 # $ON_OTHER_INTERACT
1318 if (!defined $ON_OTHER_INTERACT{$x[9]} || $ON_OTHER_INTERACT{$x[9]} eq "") {
1319 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]};
1320 } elsif ($x[10] eq "") {
1321 # do nothing
1322 }
1323 else {
1324 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10];
1325 }
1326
1327 # $notes
1328 if (!defined $notes{$x[9]} || $notes{$x[9]} eq "") {
1329 $notes{$x[9]} = $notes{$x[10]};
1330 } elsif ($x[10] eq "") {
1331 # do nothing
1332 }
1333 else {
1334 $notes{$x[9]} = $notes{$x[9]}." / ".$x[10];
1335 }
1336
1337 # $organism
1338 if (!defined $organism{$x[9]} || $organism{$x[9]} eq "") {
1339 $organism{$x[9]} = $organism{$x[10]};
1340 } elsif ($x[10] eq "") {
1341 # do nothing
1342 }
1343 else {
1344 $organism{$x[9]} = $organism{$x[9]}." / ".$x[10];
1345 }
1346 }
1347 }
1348 }
1349 close IN4;
1350
1351 print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n";
1352
1353 $stmth = $dbh->prepare("
1354 INSERT INTO Citation (
1355 ObjectName,
1356 CitationData
1357 ) VALUES (?,?)
1358 ");
1359
1360 sub add_citation {
1361 my ($cit_table, $cit_text, $cit_label) = @_;
1362 $stmth->bind_param(1, $cit_table);
1363 $stmth->bind_param(2, $cit_text);
1364 if (not $stmth->execute()) {
1365 print "Error writing $cit_label cit for table $cit_table: $stmth->errstr\n";
1366 }
1367 }
1368 my ($citation_text, $citation_table);
1369
1370 # PSP regulatory or kinase/substrate site
1371 $citation_text = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."';
1372 $citation_table = "PSP_Regulatory_site";
1373 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
1374 $citation_table = "psp_gene_site";
1375 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
1376 $citation_table = "psp_gene_site_view";
1377 add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
1378 $citation_text = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122';
1379 $citation_table = "PSP_Regulatory_site";
1380 add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
1381 $citation_table = "psp_gene_site";
1382 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
1383 $citation_table = "psp_gene_site_view";
1384 add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
1385
1386 # NetworKIN site
1387 $citation_text = 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052';
1388 $citation_table = "psp_gene_site";
1389 add_citation($citation_table, $citation_text, "NetworkKIN");
1390 $citation_table = "psp_gene_site_view";
1391 add_citation($citation_table, $citation_text, "NetworkKIN");
1392 $citation_text = 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.296';
1393 $citation_table = "psp_gene_site";
1394 add_citation($citation_table, $citation_text, "NetworkKIN");
1395 $citation_table = "psp_gene_site_view";
1396 add_citation($citation_table, $citation_text, "NetworkKIN");
1397 $citation_text = 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093';
1398 $citation_table = "psp_gene_site";
1399 add_citation($citation_table, $citation_text, "NetworkKIN");
1400 $citation_table = "psp_gene_site_view";
1401 add_citation($citation_table, $citation_text, "NetworkKIN");
1402
1403 # pSTY motifs
1404 $citation_text = 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285';
1405 $citation_table = "psp_gene_site";
1406 add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
1407 $citation_table = "psp_gene_site_view";
1408 add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
1409 $citation_text = 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159';
1410 $citation_table = "psp_gene_site";
1411 add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
1412 $citation_table = "psp_gene_site_view";
1413 add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
1414
1415
1416 ###############################################################################################################################
1417 #
1418 # Read the data file:
1419 # 1) find sequences that match the NetworKIN predictions
1420 # 2) find motifs that match the observed sequences
1421 #
1422 ###############################################################################################################################
1423
1424 print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n";
1425
1426 my $ppep_regsite_LUT_stmth;
1427 $ppep_regsite_LUT_stmth = $dbh->prepare("
1428 INSERT INTO ppep_regsite_LUT (
1429 ppep_id,
1430 site_plusminus_7AA
1431 ) VALUES (?,?)
1432 ");
1433
1434 my ($start_seconds, $start_microseconds) = gettimeofday;
1435
1436 foreach my $peptide (keys %data) {
1437 # find the unique phospho-motifs for this $peptide
1438 my @all_motifs = ();
1439 my $have_all_motifs = 0;
1440 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
1441 my $tmp_motif = $p_motifs{$peptide}[$i];
1442 push(@all_motifs, $tmp_motif);
1443 $have_all_motifs = 1;
1444 }
1445 if ($have_all_motifs == 1) {
1446 for my $j (0 .. $#all_motifs) {
1447 if (defined $all_motifs[$j]) {
1448 $all_motifs[$j] =~ s/\d+-\[\s//;
1449 $all_motifs[$j] =~ s/\s\]\-\d+//;
1450 }
1451 }
1452 }
1453 my %seen = ();
1454 if ($have_all_motifs == 1) {
1455 foreach my $a (@all_motifs) {
1456 if (defined $a) {
1457 if (exists($seen{$a})) {
1458 next;
1459 } else {
1460 push(@{$unique_motifs{$peptide}}, $a);
1461 $seen{$a} = 1;
1462 }
1463 }
1464 print "push(\@{\$unique_motifs{$peptide}}, $a);\n" if ($verbose);
1465 }
1466 }
1467
1468 # count the number of phospo-sites in the motif
1469 my $number_pY = 0;
1470 my $number_pSTY = 0;
1471 if ($phospho_type eq 'y') {
1472 if (defined(${$unique_motifs{$peptide}}[0])) {
1473 while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {
1474 $number_pY++;
1475 }
1476 }
1477 }
1478 if ($phospho_type eq 'sty') {
1479 print "looking for unique_motifs for $peptide\n" if ($verbose);
1480 if (defined(${$unique_motifs{$peptide}}[0])) {
1481 while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {
1482 $number_pSTY++;
1483 print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose);
1484 }
1485 }
1486 }
1487
1488
1489 # search each of the unique motifs for matches
1490 print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose);
1491 for my $i (0 .. $#{$unique_motifs{$peptide}}) {
1492 print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose);
1493 my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
1494 print " --- matching unique motif $tmp_motif for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
1495 my $formatted_sequence;
1496 if (($number_pY == 1) || ($number_pSTY == 1)) {
1497 my $seq_plus5aa = "";
1498 my $seq_plus7aa = "";
1499 $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
1500 print " a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
1501 if ($phospho_type eq 'y') {
1502 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
1503 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
1504 }
1505 elsif ($phospho_type eq "sty") {
1506 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
1507 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
1508 }
1509
1510 if (defined $seq_plus7aa) {
1511 # commit the 7aa LUT records
1512 $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} );
1513 $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa );
1514 if (not $ppep_regsite_LUT_stmth->execute()) {
1515 print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: $ppep_regsite_LUT_stmth->errstr\n";
1516 }
1517 }
1518 for my $i (0 .. $#kinases_observed) {
1519 if (defined $seq_plus5aa) {
1520 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha
1521 if (exists($p_sequence_kinase -> {$tmp})) {
1522 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE
1523 }
1524 }
1525 }
1526 for my $i (0 .. $#motif_sequence) {
1527 print "matching $motif_sequence[$i]" if ($verbose);
1528 if ($peptide =~ /$motif_sequence[$i]/) {
1529 $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X";
1530 }
1531 }
1532 for my $i (0 .. $#kinases_PhosphoSite) {
1533 if (defined $seq_plus7aa) {
1534 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2
1535 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
1536 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
1537 }
1538 }
1539 }
1540 if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
1541 $seq_plus7aa_2{$peptide} = $seq_plus7aa;
1542 $domain_2{$peptide} = $domain{$seq_plus7aa};
1543 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
1544 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
1545 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
1546 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
1547 $notes_2{$peptide} = $notes{$seq_plus7aa};
1548 $organism_2{$peptide} = $organism{$seq_plus7aa};
1549 } else {
1550 }
1551 }
1552 elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
1553 $formatted_sequence = $tmp_motif;
1554 $seq_plus5aa = "";
1555 $seq_plus7aa = "";
1556 #Create the sequences with only one phosphorylation site
1557 #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329
1558
1559 my (@sites, $offset, $next_p_site);
1560 $sites[0] = index($tmp_motif, "p");
1561 $offset = $sites[0] + 1;
1562 $next_p_site = 0;
1563 while ($next_p_site != -1) {
1564 $next_p_site = index($tmp_motif, "p", $offset);
1565 if ($next_p_site != -1) {
1566 push (@sites, $next_p_site);
1567 }
1568 $offset = $next_p_site+1;
1569 }
1570
1571 my @pSTY_sequences;
1572 for my $n (0 .. $#sites) {
1573 $pSTY_sequences[$n] = $tmp_motif;
1574 for (my $m = $#sites; $m >= 0; $m--) {
1575 if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}
1576 }
1577 }
1578
1579 my @formatted_sequences;
1580 for my $k (0 .. $#sites) {
1581 $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
1582 }
1583
1584 for my $k (0 .. $#formatted_sequences) {
1585 print " b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
1586 if ($phospho_type eq 'y') {
1587 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1];
1588 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1];
1589 }
1590 elsif ($phospho_type eq "sty") {
1591 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1];
1592 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1];
1593 }
1594 for my $i (0 .. $#kinases_observed) {
1595 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha
1596 if (exists($p_sequence_kinase -> {$tmp})) {
1597 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
1598 }
1599 }
1600 $pSTY_sequence = $formatted_sequences[$k];
1601 for my $i (0 .. $#motif_sequence) {
1602 if ($pSTY_sequence =~ /$motif_sequence[$i]/) {
1603 $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X";
1604 }
1605 }
1606 for my $i (0 .. $#kinases_PhosphoSite) {
1607 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2
1608 #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp";
1609 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
1610 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
1611 }
1612 }
1613 if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) {
1614 $seq_plus7aa_2{$peptide} = $seq_plus7aa;
1615
1616 # $domain
1617 if ($domain_2{$peptide} eq "") {
1618 $domain_2{$peptide} = $domain{$seq_plus7aa};
1619 }
1620 elsif ($domain{$seq_plus7aa} eq "") {
1621 # do nothing
1622 }
1623 else {
1624 $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa};
1625 }
1626
1627
1628 # $ON_FUNCTION_2
1629 if ($ON_FUNCTION_2{$peptide} eq "") {
1630 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
1631 }
1632 elsif ($ON_FUNCTION{$seq_plus7aa} eq "") {
1633 # do nothing
1634 }
1635 else {
1636 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa};
1637 }
1638
1639 # $ON_PROCESS_2
1640 if ($ON_PROCESS_2{$peptide} eq "") {
1641 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
1642 }
1643 elsif ($ON_PROCESS{$seq_plus7aa} eq "") {
1644 # do nothing
1645 }
1646 else {
1647 $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa};
1648 }
1649
1650 # $ON_PROT_INTERACT_2
1651 if ($ON_PROT_INTERACT_2{$peptide} eq "") {
1652 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
1653 }
1654 elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") {
1655 # do nothing
1656 }
1657 else {
1658 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa};
1659 }
1660
1661 # $ON_OTHER_INTERACT_2
1662 if ($ON_OTHER_INTERACT_2{$peptide} eq "") {
1663 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
1664 }
1665 elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") {
1666 # do nothing
1667 }
1668 else {
1669 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa};
1670 }
1671
1672 # $notes_2
1673 if ($notes_2{$peptide} eq "") {
1674 $notes_2{$peptide} = $notes{$seq_plus7aa};
1675 }
1676 elsif ($notes{$seq_plus7aa} eq "") {
1677 # do nothing
1678 }
1679 else {
1680 $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa};
1681 }
1682 $notes_2{$peptide} = $notes{$seq_plus7aa};
1683
1684 # $organism_2
1685 if ($organism_2{$peptide} eq "") {
1686 $organism_2{$peptide} = $organism{$seq_plus7aa};
1687 }
1688 elsif ($organism{$seq_plus7aa} eq "") {
1689 # do nothing
1690 }
1691 else {
1692 $organism_2{$peptide} = $organism_2{$peptide}." / ".$organism{$seq_plus7aa};
1693 }
1694 $organism_2{$peptide} = $organism{$seq_plus7aa};
1695 } else {
1696 } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa}))
1697 } # for my $k (0 .. $#formatted_sequences)
1698 } # if/else number of phosphosites
1699 } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}})
1700 } # for each $peptide
1701
1702 my ($end_seconds, $end_microseconds) = gettimeofday;
1703
1704 my $delta_seconds = $end_seconds - $start_seconds;
1705 my $delta_microseconds = $end_microseconds - $start_microseconds;
1706 $delta_microseconds += 1000000 * $delta_seconds;
1707 my $key_count = keys(%data);
1708 print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($delta_microseconds / $key_count));
1709
1710 ($start_seconds, $start_microseconds) = gettimeofday;
1711
1712 print "Writing PSP_Regulatory_site records\n";
1713
1714 my $psp_regulatory_site_stmth = $dbh->prepare("
1715 INSERT INTO PSP_Regulatory_site (
1716 DOMAIN,
1717 ON_FUNCTION,
1718 ON_PROCESS,
1719 ON_PROT_INTERACT,
1720 ON_OTHER_INTERACT,
1721 NOTES,
1722 SITE_PLUSMINUS_7AA,
1723 ORGANISM
1724 ) VALUES (?,?,?,?,?,?,?,?)
1725 ");
1726
1727 foreach my $peptide (keys %data) {
1728 if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) {
1729 $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide});
1730 $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
1731 $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide});
1732 $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
1733 $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
1734 $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide});
1735 $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide});
1736 $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide});
1737 if (not $psp_regulatory_site_stmth->execute()) {
1738 print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n";
1739 } else {
1740 }
1741 } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) {
1742 print "\$domain_2{$peptide} is undefined\n"; #ACE
1743 }
1744 }
1745
1746 $dbh->{AutoCommit} = $auto_commit;
1747 # auto_commit implicitly finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish;
1748 $dbh->disconnect if ( defined $dbh );
1749
1750
1751 ($end_seconds, $end_microseconds) = gettimeofday;
1752
1753 $delta_seconds = $end_seconds - $start_seconds;
1754 $delta_microseconds = $end_microseconds - $start_microseconds;
1755 $delta_microseconds += 1000000 * $delta_seconds;
1756 $key_count = keys(%data);
1757 print sprintf("Write time is %d microseconds\n", ($delta_microseconds));
1758
1759 print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n";
1760
1761 ###############################################################################################################################
1762 #
1763 # Print to the output file
1764 #
1765 ###############################################################################################################################
1766
1767
1768 open (OUT, ">$file_out") || die "could not open the fileout: $file_out";
1769 open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt";
1770
1771 # print the header info
1772 print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n";
1773 print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";
1774
1775 # print the PhosphoSite regulatory data
1776 print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";
1777
1778 # print the sample names
1779 for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }
1780
1781 # print the kinases and groups
1782 for my $i (0 .. $#kinases_observed) {
1783 my $temp = $kinases_observed[$i]."_NetworKIN";
1784 print OUT "$temp\t";
1785 push(@kinases_observed_lbl, $temp);
1786 }
1787 my @motif_type_keys = keys %motif_type;
1788 for my $i (1 .. $#motif_type_keys) {
1789 print OUT "$motif_type{$motif_type_keys[$i]}\t";
1790 }
1791 for my $i (0 .. $#kinases_PhosphoSite) {
1792 my $temp = $kinases_PhosphoSite[$i]; # ."_PhosphoSite";
1793 if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
1794 if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
1795 push(@phosphosites_observed_lbl, $temp);
1796 }
1797
1798 # begin DDL-to-SQLite
1799 # ---
1800 $dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
1801 $auto_commit = $dbh->{AutoCommit};
1802 $dbh->{AutoCommit} = 0;
1803 print "DB connection $dbh is to $db_out, opened for modification\n";
1804
1805 my $sample_stmth;
1806 $sample_stmth = $dbh->prepare("
1807 INSERT INTO sample (
1808 id,
1809 name
1810 ) VALUES (?,?)
1811 ");
1812
1813 my $ppep_intensity_stmth;
1814 $ppep_intensity_stmth = $dbh->prepare("
1815 INSERT INTO ppep_intensity (
1816 ppep_id,
1817 sample_id,
1818 intensity
1819 ) VALUES (?,?,?)
1820 ");
1821
1822 my $site_type_stmth;
1823 $site_type_stmth = $dbh->prepare("
1824 insert into site_type (
1825 id,
1826 type_name
1827 ) values (?,?)
1828 ");
1829
1830 my $ppep_gene_site_stmth;
1831 $ppep_gene_site_stmth = $dbh->prepare("
1832 insert into ppep_gene_site (
1833 ppep_id,
1834 gene_names,
1835 kinase_map,
1836 site_type_id
1837 ) values (?,?,?,?)
1838 ");
1839
1840 my $ppep_metadata_stmth;
1841 $ppep_metadata_stmth = $dbh->prepare("
1842 INSERT INTO ppep_metadata
1843 ( ppep_id
1844 , protein_description
1845 , gene_name
1846 , FASTA_name
1847 , phospho_sites
1848 , motifs_unique
1849 , accessions
1850 , motifs_all_members
1851 , domain
1852 , ON_FUNCTION
1853 , ON_PROCESS
1854 , ON_PROT_INTERACT
1855 , ON_OTHER_INTERACT
1856 , notes
1857 ) VALUES (
1858 ?,?,?,?,?,?,?
1859 , ?,?,?,?,?,?,?
1860 )
1861 ");
1862 # end DDL-to-SQLite
1863 # ...
1864
1865 # begin store-to-SQLite "sample" table
1866 # ---
1867 # %sample_id_lut maps name -> ID
1868 for my $sample_name (keys %sample_id_lut) {
1869 $sample_stmth->bind_param( 2, $sample_name );
1870 $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} );
1871 if (not $sample_stmth->execute()) {
1872 print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): $sample_stmth->errstr\n";
1873 }
1874 }
1875 # end store-to-SQLite "sample" table
1876 # ...
1877
1878 # begin store-to-SQLite "site_type" table
1879 # ---
1880 sub add_site_type {
1881 my ($site_type_id, $site_type_type_name) = @_;
1882 $site_type_stmth->bind_param( 2, $site_type_type_name );
1883 $site_type_stmth->bind_param( 1, $site_type_id );
1884 if (not $site_type_stmth->execute()) {
1885 die "Error writing tuple ($site_type_id,$site_type_type_name): $site_type_stmth->errstr\n";
1886 }
1887 }
1888 add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE});
1889 add_site_type($SITE_HPRD , $site_description{$SITE_HPRD });
1890 add_site_type($SITE_PHOSIDA , $site_description{$SITE_PHOSIDA });
1891 add_site_type($SITE_PHOSPHOSITE , $site_description{$SITE_PHOSPHOSITE });
1892 # end store-to-SQLite "site_type" table
1893 # ...
1894
1895 foreach my $peptide (sort(keys %data)) {
1896 next if (grep($peptide, @failed_matches));
1897 my $ppep_id = $ppep_id_lut{$peptide};
1898 my @ppep_metadata = ();
1899 my @ppep_intensity = ();
1900 my @gene = ();
1901 my $gene_names;
1902 my $j;
1903 # Print the peptide itself
1904 # column 1: p-peptide
1905 print OUT "$peptide\t";
1906 push (@ppep_metadata, $ppep_id);
1907 push (@ppep_intensity, $peptide);
1908
1909 my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR';
1910 # skip over failed matches
1911 print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond;
1912 if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) {
1913 # column 2: Protein description
1914 # column 3: Gene name(s)
1915 # column 4: FASTA name
1916 # column 5: phospho-residues
1917 # Column 6: UNIQUE phospho-motifs
1918 # Column 7: accessions
1919 # Column 8: ALL motifs with residue numbers
1920 # 2 3 4 5 6 7 8
1921 print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
1922 print "No match found for '$peptide' in sequence database\n";
1923 $gene_names = '$FAILED_MATCH_GENE_NAME';
1924 } else {
1925 my @description = ();
1926 my %seen = ();
1927 # Print just the protein description
1928 for $i (0 .. $#{$names{$peptide}}) {
1929 my $long_name = $names{$peptide}[$i];
1930 my @naming_parts = split(/\sOS/, $long_name);
1931 my @front_half = split(/\s/, $naming_parts[0]);
1932 push(@description, join(" ", @front_half[1..($#front_half)]));
1933 }
1934 # column 2: Protein description
1935 print OUT join(" /// ", @description), "\t";
1936 push (@ppep_metadata, join(" /// ", @description));
1937
1938 # Print just the gene name
1939 for $i (0 .. $#{$names{$peptide}}) {
1940 my $tmp_gene = $names{$peptide}[$i];
1941 $tmp_gene =~ s/^.*GN=//;
1942 $tmp_gene =~ s/\s.*//;
1943 if (!exists($seen{$tmp_gene})) {
1944 push(@gene, $tmp_gene);
1945 $seen{$tmp_gene} = $tmp_gene;
1946 }
1947 }
1948 # column 3: Gene name(s)
1949 $gene_names = join(" /// ", @gene);
1950 print OUT $gene_names, "\t";
1951 push (@ppep_metadata, join(" /// ", @gene));
1952
1953 # column 4: FASTA name
1954 print OUT join(" /// ", @{$names{$peptide}}), "\t";
1955 push (@ppep_metadata, join(" /// ", @{$names{$peptide}}));
1956
1957 # column 5: phospho-residues
1958 my $tmp_for_insert = "";
1959 my $foobar;
1960 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
1961 print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond;
1962 if ($i < $#{ $matched_sequences{$peptide} }) {
1963 if (defined $p_residues{$peptide}{$i}) {
1964 @tmp_p_residues = @{$p_residues{$peptide}{$i}};
1965 for $j (0 .. $#tmp_p_residues) {
1966 if ($j < $#tmp_p_residues) {
1967 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
1968 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
1969 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
1970 }
1971 elsif ($j == $#tmp_p_residues) {
1972 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
1973 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
1974 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
1975 }
1976 }
1977 }
1978 }
1979 elsif ($i == $#{ $matched_sequences{$peptide} }) {
1980 my $there_were_sites = 0;
1981 if (defined $p_residues{$peptide}{$i}) {
1982 @tmp_p_residues = @{$p_residues{$peptide}{$i}};
1983 if ($#tmp_p_residues > 0) {
1984 for my $j (0 .. $#tmp_p_residues) {
1985 if ($j < $#tmp_p_residues) {
1986 if (defined $p_residues{$peptide}{$i}[$j]) {
1987 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
1988 $foobar = $residues{$peptide}{$i}[$j];
1989 if (defined $foobar) {
1990 print OUT "$foobar";
1991 print OUT "$tmp_site_for_printing, ";
1992 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
1993 $there_were_sites = 1;
1994 }
1995 }
1996 }
1997 elsif ($j == $#tmp_p_residues) {
1998 if (defined $p_residues{$peptide}{$i}[$j]) {
1999 $foobar = $residues{$peptide}{$i}[$j];
2000 if (defined $foobar) {
2001 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
2002 print OUT "$foobar";
2003 print OUT "$tmp_site_for_printing\t";
2004 $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
2005 $there_were_sites = 1;
2006 }
2007 }
2008 }
2009 }
2010 }
2011 }
2012 if (0 == $there_were_sites) {
2013 print OUT "\t";
2014 }
2015 }
2016 }
2017 print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond;
2018 push (@ppep_metadata, $tmp_for_insert);
2019
2020 # Column 6: UNIQUE phospho-motifs
2021 print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
2022 push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}}));
2023
2024 # Column 7: accessions
2025 if (defined $accessions{$peptide}) {
2026 print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
2027 push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}}));
2028 } else {
2029 print OUT "\t";
2030 push (@ppep_metadata, "");
2031 }
2032
2033 # Column 8: ALL motifs with residue numbers
2034 if (defined $p_motifs{$peptide}) {
2035 print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
2036 push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
2037 } else {
2038 print OUT "\t";
2039 push (@ppep_metadata, "");
2040 }
2041
2042 }
2043
2044 # Print the PhosphoSite regulatory data
2045
2046 if (defined $domain_2{$peptide}) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; }
2047 if (defined $ON_FUNCTION_2{$peptide}) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; }
2048 if (defined $ON_PROCESS_2{$peptide}) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; }
2049 if (defined $ON_PROT_INTERACT_2{$peptide}) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
2050 if (defined $ON_OTHER_INTERACT_2{$peptide}) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
2051 if (defined $notes_2{$peptide}) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; }
2052
2053 if (defined $domain_2{$peptide}) { push (@ppep_metadata, $domain_2{$peptide}); } else { push(@ppep_metadata, ""); }
2054 if (defined $ON_FUNCTION_2{$peptide}) { push (@ppep_metadata, $ON_FUNCTION_2{$peptide}); } else { push(@ppep_metadata, ""); }
2055 if (defined $ON_PROCESS_2{$peptide}) { push (@ppep_metadata, $ON_PROCESS_2{$peptide}); } else { push(@ppep_metadata, ""); }
2056 if (defined $ON_PROT_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_PROT_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); }
2057 if (defined $ON_OTHER_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_OTHER_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); }
2058 if (defined $notes_2{$peptide}) { push (@ppep_metadata, $notes_2{$peptide}); } else { push(@ppep_metadata, ""); }
2059
2060 # begin store-to-SQLite "ppep_metadata" table
2061 # ---
2062 for $i (1..14) {
2063 $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]);
2064 }
2065 if (not $ppep_metadata_stmth->execute()) {
2066 print "Error writing ppep_metadata row for phosphopeptide $ppep_metadata[$i]: $ppep_metadata_stmth->errstr\n";
2067 }
2068 # ...
2069 # end store-to-SQLite "ppep_metadata" table
2070
2071 # Print the data
2072 @tmp_data = ();
2073 foreach (@{$data{$peptide}}) {
2074 push(@tmp_data, $_);
2075 }
2076 print OUT join("\t", @tmp_data), "\t";
2077
2078 # begin store-to-SQLite "ppep_intensity" table
2079 # ---
2080 # commit the sample intensities
2081 $i = 0;
2082 foreach (@{$data{$peptide}}) {
2083 my $intense = $_;
2084 $ppep_intensity_stmth->bind_param( 1, $ppep_id );
2085 $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} );
2086 $ppep_intensity_stmth->bind_param( 3, $intense );
2087 if (not $ppep_intensity_stmth->execute()) {
2088 print "Error writing tuple ($peptide,$samples[$i],$intense): $ppep_intensity_stmth->errstr\n";
2089 }
2090 $i += 1;
2091 }
2092 # ...
2093 # end store-to-SQLite "ppep_intensity" table
2094
2095 # print the kinase-substrate data
2096 for my $i (0 .. $#kinases_observed) {
2097 if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
2098 print OUT "X\t";
2099 my $NetworKIN_label = $kinases_observed[$i]; #."_NetworKIN";
2100 print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n";
2101 # begin store-to-SQLite "ppep_gene_site" table
2102 # ---
2103 $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id
2104 $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names
2105 $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label); # ppep_gene_site.kinase_map
2106 $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id
2107 if (not $ppep_gene_site_stmth->execute()) {
2108 print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): $ppep_gene_site_stmth->errstr\n";
2109 }
2110 # ...
2111 # end store-to-SQLite "ppep_gene_site" table
2112 }
2113 else { print OUT "\t";}
2114 }
2115 my %wrote_motif;
2116 my $motif_parts_0;
2117 my @motif_split;
2118 my $one_motif;
2119
2120 for my $i (0 .. $#motif_type_keys) {
2121 if (exists($kinase_motif_matches{$peptide}{$motif_type_keys[$i]})) {
2122 print OUT "X\t";
2123 #ACE-2022.06.20 $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];
2124 $motif_parts_0 = $motif_type{$motif_type_keys[$i]};
2125 @motif_split = split("[|]", $motif_parts_0);
2126 #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0";
2127 for my $j (0 .. $#motif_split) {
2128 $one_motif = $motif_split[$j];
2129 #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0";
2130 my $key = "$peptide\t$gene_names\t$one_motif";
2131 if (!exists($wrote_motif{$key})) {
2132 $wrote_motif{$key} = $key;
2133 print MELT "$peptide\t$gene_names\t$motif_description[$i]\t$one_motif\n";
2134 # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug
2135 # begin store-to-SQLite "ppep_gene_site" table
2136 # ---
2137 $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id
2138 $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names
2139 $ppep_gene_site_stmth->bind_param(3, $one_motif); # ppep_gene_site.kinase_map
2140 $ppep_gene_site_stmth->bind_param(4, $site_id{$motif_description[$i]}); # ppep_gene_site.site_type_id
2141 if (not $ppep_gene_site_stmth->execute()) {
2142 print "Error writing tuple ($peptide,$gene_names,$one_motif): $ppep_gene_site_stmth->errstr\n";
2143 }
2144 # ...
2145 # end store-to-SQLite "ppep_gene_site" table
2146 }
2147 }
2148 }
2149 else { print OUT "\t";}
2150 }
2151 for my $i (0 .. $#kinases_PhosphoSite) {
2152 if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {
2153 print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n";
2154 if ($i < $#kinases_PhosphoSite) {
2155 print OUT "X\t";
2156 }
2157 else {
2158 print OUT "X\n";
2159 }
2160 # begin store-to-SQLite "ppep_gene_site" table
2161 # ---
2162 $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id
2163 $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names
2164 $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map
2165 $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE); # ppep_gene_site.site_type_id
2166 if (not $ppep_gene_site_stmth->execute()) {
2167 print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): $ppep_gene_site_stmth->errstr\n";
2168 }
2169 # ...
2170 # end store-to-SQLite "ppep_gene_site" table
2171 }
2172 else {
2173 if ($i < $#kinases_PhosphoSite) {
2174 print OUT "\t";
2175 }
2176 elsif ($i == $#kinases_PhosphoSite) {
2177 print OUT "\n";
2178 }
2179 }
2180 }
2181 }
2182
2183 close OUT;
2184 close MELT;
2185 $ppep_gene_site_stmth->finish;
2186 print "begin DB commit at " . format_localtime_iso8601() . "\n";
2187 $dbh->{AutoCommit} = $auto_commit;
2188 $dbh->disconnect if ( defined $dbh );
2189
2190 print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n";
2191
2192 ###############################################################################################################################