diff gplib.py @ 3:ace92c9a4653 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author    bgruening
date      Wed, 27 Jan 2021 19:27:47 +0000 (2021-01-27)
parents   20429f4c1b95
children  ddcf35a868b8
--- a/gplib.py	Mon Jan 27 18:37:05 2020 -0500
+++ b/gplib.py	Wed Jan 27 19:27:47 2021 +0000
@@ -1,13 +1,11 @@
-from distutils.spawn import find_executable
-import subprocess
-import statistics
+import gzip
 import random
-import gzip
-import uuid
-import sys
 import re
-import os
+import statistics
+import subprocess
+from distutils.spawn import find_executable
+
 
 
 """
 
@@ -19,11 +17,11 @@
 
 """
 
-################################################################################
+###############################################################################
 
 def graphprot_predictions_get_median(predictions_file):
     """
-    Given a GraphProt .predictions file, read in site scores and return 
+    Given a GraphProt .predictions file, read in site scores and return
     the median value.
 
     >>> test_file = "test-data/test.predictions"
@@ -43,29 +41,29 @@
     return statistics.median(sc_list)
 
 
-################################################################################
+###############################################################################
 
-def graphprot_profile_get_top_scores_median(profile_file,
-                                            profile_type="profile",
-                                            avg_profile_extlr=5):
+def graphprot_profile_get_tsm(profile_file,
+                              profile_type="profile",
+                              avg_profile_extlr=5):
     """
-    Given a GraphProt .profile file, extract for each site (identified by 
-    column 1 ID) the top (= highest) score. Then return the median of these 
+    Given a GraphProt .profile file, extract for each site (identified by
+    column 1 ID) the top (= highest) score. Then return the median of these
     top scores.
-    
+
     profile_type can be either "profile" or "avg_profile".
-    "avg_profile means that the position-wise scores will first get smoothed 
-    out by calculating for each position a new score through taking a 
-    sequence window -avg_profile_extlr to +avg_profile_extlr of the position 
-    and calculate the mean score over this window and assign it to the position. 
-    After that, the maximum score of each site is chosen, and the median over 
-    all maximum scores is returned.
-    "profile" leaves the position-wise scores as they are, directly extracting 
+    "avg_profile means that the position-wise scores will first get smoothed
+    out by calculating for each position a new score through taking a
+    sequence window -avg_profile_extlr to +avg_profile_extlr of the position
+    and calculate the mean score over this window and assign it to the
+    position. After that, the maximum score of each site is chosen, and the
+    median over all maximum scores is returned.
+    "profile" leaves the position-wise scores as they are, directly extracting
     the maximum for each site and then reporting the median.
-    
+
     >>> test_file = "test-data/test.profile"
-    >>> graphprot_profile_get_top_scores_median(test_file)
+    >>> graphprot_profile_get_tsm(test_file)
     3.2
 
     """
@@ -90,25 +88,27 @@
             max_list.append(max_sc)
         elif profile_type == "avg_profile":
            # Convert profile score list to average profile scores list.
-            aps_list = list_moving_window_average_values(lists_dic[seq_id],
-                                                         win_extlr=avg_profile_extlr)
+            aps_list = \
+                list_moving_window_average_values(lists_dic[seq_id],
+                                                  win_extlr=avg_profile_extlr)
             max_sc = max(aps_list)
             max_list.append(max_sc)
         else:
-            assert 0, "invalid profile_type argument given: \"%s\"" %(profile_type)
+            assert 0, "invalid profile_type argument given: \"%s\"" \
+                % (profile_type)
     # Return the median.
     return statistics.median(max_list)
 
 
-################################################################################
+###############################################################################
 
-def list_moving_window_average_values(in_list, 
+def list_moving_window_average_values(in_list,
                                       win_extlr=5,
                                       method=1):
     """
-    Take a list of numeric values, and calculate for each position a new value, 
-    by taking the mean value of the window of positions -win_extlr and 
-    +win_extlr. If full extension is not possible (at list ends), it just 
+    Take a list of numeric values, and calculate for each position a new value,
+    by taking the mean value of the window of positions -win_extlr and
+    +win_extlr. If full extension is not possible (at list ends), it just
     takes what it gets.
     Two implementations of the task are given, chose by method=1 or method=2.
 
@@ -142,17 +142,17 @@
                 s = 0
             if e > l_list:
                 e = l_list
-            l = e-s
+            ln = e - s
             sc_sum = 0
-            for j in range(l):
-                sc_sum += in_list[s+j]
-            new_list[i] = sc_sum / l
+            for j in range(ln):
+                sc_sum += in_list[s + j]
+            new_list[i] = sc_sum / ln
     else:
-        assert 0, "invalid method ID given (%i)" %(method)
+        assert 0, "invalid method ID given (%i)" % (method)
     return new_list
 
 
-################################################################################
+###############################################################################
 
 def echo_add_to_file(echo_string, out_file):
     """
@@ -164,17 +164,17 @@
     error = False
     if output:
         error = True
-    assert error == False, "echo is complaining:\n%s\n%s" %(check_cmd, output)
+    assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output)
 
 
-################################################################################
+###############################################################################
 
 def is_tool(name):
     """Check whether tool "name" is in PATH."""
     return find_executable(name) is not None
 
 
-################################################################################
+###############################################################################
 
 def count_fasta_headers(fasta_file):
     """
@@ -194,7 +194,7 @@
     return row_count
 
 
-################################################################################
+###############################################################################
 
 def make_file_copy(in_file, out_file):
     """
@@ -202,17 +202,20 @@
     check_cmd = "cat " + in_file + " > " + out_file
-    assert in_file != out_file, "cat does not like to cat file into same file (%s)" %(check_cmd)
+    assert in_file != out_file, \
+        "cat does not like to cat file into same file (%s)" % (check_cmd)
     output = subprocess.getoutput(check_cmd)
     error = False
     if output:
         error = True
-    assert error == False, "cat did not like your input (in_file: %s, out_file: %s):\n%s" %(in_file, out_file, output)
+    assert not error, \
+        "cat did not like your input (in_file: %s, out_file: %s):\n%s" \
+        % (in_file, out_file, output)
 
 
-################################################################################
+###############################################################################
 
-def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, 
+def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa,
                                       test_size=500):
     """
     Split in_fasta .fa file into two files (e.g. test, train).
@@ -236,19 +239,48 @@
     TRAINOUT.close()
 
 
-################################################################################
+###############################################################################
+
+def check_seqs_dic_format(seqs_dic):
+    """
+    Check sequence dictionary for lowercase-only sequences or sequences
+    wich have lowercase nts in between uppercase nts.
+    Return suspicious IDs as list or empty list if not hits.
+    IDs with lowercase-only sequences.
+
+    >>> seqs_dic = {"id1" : "acguACGU", "id2" : "acgua", "id3" : "acgUUaUcc"}
+    >>> check_seqs_dic_format(seqs_dic)
+    ['id2', 'id3']
+    >>> seqs_dic = {"id1" : "acgAUaa", "id2" : "ACGUACUA"}
+    >>> check_seqs_dic_format(seqs_dic)
+    []
+
+    """
+    assert seqs_dic, "given seqs_dic empty"
+    bad_seq_ids = []
+    for seq_id in seqs_dic:
+        seq = seqs_dic[seq_id]
+        if re.search("^[acgtun]+$", seq):
+            bad_seq_ids.append(seq_id)
+        if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq):
+            bad_seq_ids.append(seq_id)
+    return bad_seq_ids
+
+
+###############################################################################
 
 def read_fasta_into_dic(fasta_file,
                         seqs_dic=False,
                         ids_dic=False,
                         read_dna=False,
+                        short_ensembl=False,
                        reject_lc=False,
                         convert_to_uc=False,
                         skip_n_seqs=True):
     """
-    Read in FASTA sequences, convert to RNA, store in dictionary 
+    Read in FASTA sequences, convert to RNA, store in dictionary
     and return dictionary.
-    
+
     >>> test_fasta = "test-data/test.fa"
     >>> read_fasta_into_dic(test_fasta)
     {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}
@@ -256,8 +288,11 @@
     >>> read_fasta_into_dic(test_fasta)
     {}
     >>> test_fasta = "test-data/test.ensembl.fa"
-    >>> read_fasta_into_dic(test_fasta, read_dna=True)
+    >>> read_fasta_into_dic(test_fasta, read_dna=True, short_ensembl=True)
     {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'}
+    >>> test_fasta = "test-data/test4.fa"
+    >>> read_fasta_into_dic(test_fasta)
+    {'1': 'gccuAUGUuuua', '2': 'cugaAACUaugu'}
 
     """
     if not seqs_dic:
@@ -266,9 +301,9 @@
     seq = ""
     # Go through FASTA file, extract sequences.
-    if re.search(".+\.gz$", fasta_file):
+    if re.search(r".+\.gz$", fasta_file):
         f = gzip.open(fasta_file, 'rt')
-    else: 
+    else:
         f = open(fasta_file, "r")
     for line in f:
         if re.search(">.+", line):
@@ -276,10 +311,13 @@
             seq_id = m.group(1)
             # If there is a ".", take only first part of header.
             # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..."
-            if re.search(".+\..+", seq_id):
-                m = re.search("(.+?)\..+", seq_id)
-                seq_id = m.group(1)
-            assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file)
+            if short_ensembl:
+                if re.search(r".+\..+", seq_id):
+                    m = re.search(r"(.+?)\..+", seq_id)
+                    seq_id = m.group(1)
+            assert seq_id not in seqs_dic, \
+                "non-unique FASTA header \"%s\" in \"%s\"" \
+                % (seq_id, fasta_file)
             if ids_dic:
                 if seq_id in ids_dic:
                     seqs_dic[seq_id] = ""
@@ -290,25 +328,31 @@
                 m = re.search("([ACGTUN]+)", line, re.I)
                 seq = m.group(1)
                 if reject_lc:
-                    assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%i\" (reject_lc=True)" %(seq_id)
+                    assert \
+                        not re.search("[a-z]", seq), \
+                        "lc char detected in seq \"%i\" (reject_lc=True)" \
+                        % (seq_id)
                 if convert_to_uc:
                     seq = seq.upper()
                 # If sequences with N nucleotides should be skipped.
                 if skip_n_seqs:
                     if "n" in m.group(1) or "N" in m.group(1):
-                        print ("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id))
+                        print("WARNING: \"%s\" contains N. Discarding "
+                              "sequence ... " % (seq_id))
                        del seqs_dic[seq_id]
                         continue
                 # Convert to RNA, concatenate sequence.
                 if read_dna:
-                    seqs_dic[seq_id] += m.group(1).replace("U","T").replace("u","t")
+                    seqs_dic[seq_id] += \
+                        m.group(1).replace("U", "T").replace("u", "t")
                 else:
-                    seqs_dic[seq_id] += m.group(1).replace("T","U").replace("t","u")
+                    seqs_dic[seq_id] += \
+                        m.group(1).replace("T", "U").replace("t", "u")
     f.close()
     return seqs_dic
 
 
-################################################################################
+###############################################################################
 
 def random_order_dic_keys_into_list(in_dic):
     """
@@ -322,7 +366,7 @@
     return id_list
 
 
-################################################################################
+###############################################################################
 
 def graphprot_get_param_string(params_file):
     """
@@ -348,19 +392,19 @@
             if setting == "sequence":
                 param_string += "-onlyseq "
             else:
-                param_string += "-%s %s " %(par, setting)
+                param_string += "-%s %s " % (par, setting)
         else:
-            assert 0, "pattern matching failed for string \"%s\"" %(param)
+            assert 0, "pattern matching failed for string \"%s\"" % (param)
     return param_string
 
 
-################################################################################
+###############################################################################
 
 def seqs_dic_count_uc_nts(seqs_dic):
     """
-    Count number of uppercase nucleotides in sequences stored in sequence 
+    Count number of uppercase nucleotides in sequences stored in sequence
     dictionary.
-    
+
     >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'}
     >>> seqs_dic_count_uc_nts(seqs_dic)
     6
@@ -376,13 +420,13 @@
     return c_uc
 
 
-################################################################################
+###############################################################################
 
 def seqs_dic_count_lc_nts(seqs_dic):
     """
-    Count number of lowercase nucleotides in sequences stored in sequence 
+    Count number of lowercase nucleotides in sequences stored in sequence
     dictionary.
-    
+
     >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'}
     >>> seqs_dic_count_lc_nts(seqs_dic)
     10
@@ -398,12 +442,12 @@
     return c_uc
 
 
-################################################################################
+###############################################################################
 
 def count_file_rows(in_file):
     """
     Count number of file rows for given input file.
-    
+
     >>> test_file = "test-data/test1.bed"
     >>> count_file_rows(test_file)
     7
@@ -418,7 +462,7 @@
     return row_count
 
 
-################################################################################
+###############################################################################
 
 def bed_check_six_col_format(bed_file):
     """
@@ -444,13 +488,13 @@
     return six_col_format
 
 
-################################################################################
+###############################################################################
 
 def bed_check_unique_ids(bed_file):
     """
-    Check whether .bed file (6 column format with IDs in column 4) 
+    Check whether .bed file (6 column format with IDs in column 4)
     has unique column 4 IDs.
-    
+
     >>> test_bed = "test-data/test1.bed"
     >>> bed_check_unique_ids(test_bed)
     True
@@ -468,7 +512,7 @@
     return True
 
 
-################################################################################
+###############################################################################
 
 def get_seq_lengths_from_seqs_dic(seqs_dic):
     """
@@ -483,7 +527,7 @@
     return seq_len_dic
 
 
-################################################################################
+###############################################################################
 
 def bed_get_region_lengths(bed_file):
     """
@@ -499,20 +543,24 @@
     id2len_dic = {}
     with open(bed_file) as f:
         for line in f:
-            row = line.strip()
             cols = line.strip().split("\t")
             site_s = int(cols[1])
             site_e = int(cols[2])
             site_id = cols[3]
             site_l = site_e - site_s
-            assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(bed_file)
+            assert site_id \
+                not in id2len_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" \
+                % (bed_file)
             id2len_dic[site_id] = site_l
     f.closed
-    assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (in_bed)
+    assert id2len_dic, \
+        "No IDs read into dic (input file \"%s\" empty or malformatted?)" \
+        % (bed_file)
     return id2len_dic
 
 
-################################################################################
+###############################################################################
 
 def graphprot_get_param_dic(params_file):
     """
@@ -522,7 +570,19 @@
 
     >>> params_file = "test-data/test.params"
     >>> graphprot_get_param_dic(params_file)
-    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'}
+    {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', \
+'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', \
+'pos_train_profile_median': '5.039610', \
+'pos_train_avg_profile_median_1': '4.236340', \
+'pos_train_avg_profile_median_2': '3.868431', \
+'pos_train_avg_profile_median_3': '3.331277', \
+'pos_train_avg_profile_median_4': '2.998667', \
+'pos_train_avg_profile_median_5': '2.829782', \
+'pos_train_avg_profile_median_6': '2.626623', \
+'pos_train_avg_profile_median_7': '2.447083', \
+'pos_train_avg_profile_median_8': '2.349919', \
+'pos_train_avg_profile_median_9': '2.239829', \
+'pos_train_avg_profile_median_10': '2.161676'}
 
     """
     param_dic = {}
@@ -539,7 +599,7 @@
     return param_dic
 
 
-################################################################################
+###############################################################################
 
 def graphprot_filter_predictions_file(in_file, out_file, sc_thr=0):
     """
@@ -554,16 +614,16 @@
             score = float(cols[2])
             if score < sc_thr:
                 continue
-            OUTPRED.write("%s\n" %(row))
+            OUTPRED.write("%s\n" % (row))
     f.close()
     OUTPRED.close()
 
 
-################################################################################
+###############################################################################
 
 def fasta_read_in_ids(fasta_file):
     """
-    Given a .fa file, read in header IDs in order appearing in file, 
+    Given a .fa file, read in header IDs in order appearing in file,
     and store in list.
 
     >>> test_file = "test-data/test3.fa"
@@ -582,19 +642,19 @@
     return ids_list
 
 
-################################################################################
+###############################################################################
 
-def graphprot_profile_calculate_avg_profile(in_file, out_file, 
-                                            ap_extlr=5,
-                                            seq_ids_list=False,
-                                            method=1):
+def graphprot_profile_calc_avg_profile(in_file, out_file,
+                                       ap_extlr=5,
+                                       seq_ids_list=False,
+                                       method=1):
     """
-    Given a GraphProt .profile file, calculate average profiles and output 
+    Given a GraphProt .profile file, calculate average profiles and output
     average profile file.
-    Average profile means that the position-wise scores will get smoothed 
-    out by calculating for each position a new score, taking a sequence 
-    window -ap_extlr to +ap_extlr relative to the position 
-    and calculate the mean score over this window. The mean score then 
+    Average profile means that the position-wise scores will get smoothed
+    out by calculating for each position a new score, taking a sequence
+    window -ap_extlr to +ap_extlr relative to the position
+    and calculate the mean score over this window. The mean score then
     becomes the new average profile score at this position.
     Two different implementations of the task are given:
     method=1 (new python implementation, slower + more memory but easy to read)
@@ -604,14 +664,17 @@
     >>> out_file1 = "test-data/test2_1.avg_profile"
     >>> out_file2 = "test-data/test2_2.avg_profile"
     >>> out_file4 = "test-data/test2_3.avg_profile"
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1)
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2)
+    >>> graphprot_profile_calc_avg_profile(in_file, \
+    out_file1, ap_extlr=2, method=1)
+    >>> graphprot_profile_calc_avg_profile(in_file, \
+    out_file2, ap_extlr=2, method=2)
     >>> diff_two_files_identical(out_file1, out_file2)
     True
     >>> test_list = ["s1", "s2", "s3", "s4"]
     >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile"
     >>> out_file3 = "test-data/test3_added_ids_out.avg_profile"
-    >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list)
+    >>> graphprot_profile_calc_avg_profile(in_file, out_file3, \
+    ap_extlr=2, method=1, seq_ids_list=test_list)
     >>> diff_two_files_identical(out_file3_exp, out_file3)
     True
 
@@ -624,7 +687,7 @@
         for line in f:
             cols = line.strip().split("\t")
             site_id = int(cols[0])
-            pos = int(cols[1]) # 0-based.
+            pos = int(cols[1])  # 0-based.
             score = float(cols[2])
             # Store first position of site.
             if site_id not in site_starts_dic:
@@ -635,14 +698,15 @@
                 lists_dic[site_id] = []
                 lists_dic[site_id].append(score)
         f.close()
-        # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs).
+        # Check number of IDs (# FASTA IDs has to be same as # site IDs).
         if seq_ids_list:
             c_seq_ids = len(seq_ids_list)
             c_site_ids = len(site_starts_dic)
-            assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" %(c_seq_ids, c_site_ids)
+            assert c_seq_ids == c_site_ids, \
+                "# sequence IDs != # site IDs (%i != %i)" \
+                % (c_seq_ids, c_site_ids)
         OUTPROF = open(out_file, "w")
         # For each site, calculate average profile scores list.
-        max_list = []
         for site_id in lists_dic:
             # Convert profile score list to average profile scores list.
             aps_list = list_moving_window_average_values(lists_dic[site_id],
@@ -652,8 +716,8 @@
             if seq_ids_list:
                 site_id = seq_ids_list[site_id]
             for i, sc in enumerate(aps_list):
-                pos = i + start_pos + 1 # make 1-based.
-                OUTPROF.write("%s\t%i\t%f\n" %(site_id, pos, sc))
+                pos = i + start_pos + 1  # make 1-based.
+                OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc))
         OUTPROF.close()
     elif method == 2:
         OUTPROF = open(out_file, "w")
@@ -668,7 +732,7 @@
         for line in f:
             cols = line.strip().split("\t")
             cur_id = int(cols[0])
-            pos = int(cols[1]) # 0-based.
+            pos = int(cols[1])  # 0-based.
             score = float(cols[2])
             # Store first position of site.
             if cur_id not in site_starts_dic:
@@ -677,16 +741,18 @@
             if cur_id != old_id:
                 # Process old id scores.
                 if scores_list:
-                    aps_list = list_moving_window_average_values(scores_list,
-                                                                 win_extlr=ap_extlr)
+                    aps_list = \
+                        list_moving_window_average_values(
+                            scores_list,
+                            win_extlr=ap_extlr)
                     start_pos = site_starts_dic[old_id]
                     seq_id = old_id
                     # Get original FASTA sequence ID.
                     if seq_ids_list:
                         seq_id = seq_ids_list[old_id]
                     for i, sc in enumerate(aps_list):
-                        pos = i + start_pos + 1 # make 1-based.
-                        OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc))
+                        pos = i + start_pos + 1  # make 1-based.
+                        OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
                     # Reset list.
                     scores_list = []
             old_id = cur_id
@@ -705,19 +771,19 @@
             if seq_ids_list:
                 seq_id = seq_ids_list[old_id]
             for i, sc in enumerate(aps_list):
-                pos = i + start_pos + 1 # make 1-based.
-                OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc))
+                pos = i + start_pos + 1  # make 1-based.
+                OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc))
         OUTPROF.close()
 
 
-################################################################################
+###############################################################################
 
 def graphprot_profile_extract_peak_regions(in_file, out_file,
                                            max_merge_dist=0,
                                            sc_thr=0):
     """
     Extract peak regions from GraphProt .profile file.
-    Store the peak regions (defined as regions with scores >= sc_thr) 
+    Store the peak regions (defined as regions with scores >= sc_thr)
     as to out_file in 6-column .bed.
 
     TODO:
@@ -735,7 +801,8 @@
     >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10)
    >>> diff_two_files_identical(out_file, empty_file)
     True
-    >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2)
+    >>> graphprot_profile_extract_peak_regions(in_file, out_file, \
+    max_merge_dist=2)
     >>> diff_two_files_identical(out_file, exp2_file)
     True
 
@@ -753,7 +820,7 @@
        for line in f:
             cols = line.strip().split("\t")
             cur_id = cols[0]
-            pos = int(cols[1]) # 0-based.
+            pos = int(cols[1])  # 0-based.
             score = float(cols[2])
             # Store first position of site.
             if cur_id not in site_starts_dic:
@@ -768,17 +835,21 @@
                 # Process old id scores.
                 if scores_list:
                     # Extract peaks from region.
-                    peak_list = list_extract_peaks(scores_list,
-                                                   max_merge_dist=max_merge_dist,
-                                                   coords="bed",
-                                                   sc_thr=sc_thr)
+                    peak_list = \
+                        list_extract_peaks(scores_list,
+                                           max_merge_dist=max_merge_dist,
+                                           coords="bed",
+                                           sc_thr=sc_thr)
                     start_pos = site_starts_dic[old_id]
                     # Print out peaks in .bed format.
-                    for l in peak_list:
-                        peak_s = start_pos + l[0]
-                        peak_e = start_pos + l[1]
-                        site_id = "%s,%i" %(old_id, l[2])
-                        OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3]))
+                    for ln in peak_list:
+                        peak_s = start_pos + ln[0]
+                        peak_e = start_pos + ln[1]
+                        site_id = "%s,%i" % (old_id, ln[2])
+                        OUTPEAKS.write("%s\t%i\t%i"
+                                       "\t%s\t%f\t+\n"
+                                       % (old_id, peak_s,
+                                          peak_e, site_id, ln[3]))
                     # Reset list.
                     scores_list = []
             old_id = cur_id
@@ -790,33 +861,34 @@
     # Process last block.
     if scores_list:
         # Extract peaks from region.
-        peak_list = list_extract_peaks(scores_list, 
+        peak_list = list_extract_peaks(scores_list,
                                        max_merge_dist=max_merge_dist,
                                        coords="bed",
                                        sc_thr=sc_thr)
         start_pos = site_starts_dic[old_id]
         # Print out peaks in .bed format.
-        for l in peak_list:
-            peak_s = start_pos + l[0]
-            peak_e = start_pos + l[1]
-            site_id = "%s,%i" %(old_id, l[2]) # best score also 1-based.
-            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3]))
+        for ln in peak_list:
+            peak_s = start_pos + ln[0]
+            peak_e = start_pos + ln[1]
+            site_id = "%s,%i" % (old_id, ln[2])  # best score also 1-based.
+            OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n"
+                           % (old_id, peak_s, peak_e, site_id, ln[3]))
     OUTPEAKS.close()
 
 
-################################################################################
+###############################################################################
 
 def list_extract_peaks(in_list,
                        max_merge_dist=0,
                        coords="list",
                       sc_thr=0):
     """
-    Extract peak regions from list. 
+    Extract peak regions from list.
     Peak region is defined as region >= score threshold.
-    
+
     coords=bed : peak start 0-based, peak end 1-based.
     coords=list : peak start 0-based, peak end 0-based.
-    
+
     >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5]
    >>> list_extract_peaks(test_list)
     [[1, 4, 3, 4.5], [6, 7, 7, 6.5]]
@@ -862,7 +934,6 @@
             # Before was peak region?
             if inside:
                 # Store peak region.
-                #peak_infos = "%i,%i,%i,%f" %(pr_s, pr_e, pr_top_pos, pr_top_sc)
                 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc]
                 peak_list.append(peak_infos)
                 inside = False
@@ -898,7 +969,8 @@
                     if peak_list[i][3] < peak_list[j][3]:
                         new_top_pos = peak_list[j][2]
                         new_top_sc = peak_list[j][3]
-                    new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc]
+                    new_peak = [peak_list[i][0], peak_list[j][1],
+                                new_top_pos, new_top_sc]
                 # If two peaks were merged.
                 if new_peak:
                     merged_peak_list.append(new_peak)
@@ -915,15 +987,16 @@
     if coords == "bed":
        for i in range(len(peak_list)):
             peak_list[i][1] += 1
-            peak_list[i][2] += 1 # 1-base best score position too.
+            peak_list[i][2] += 1  # 1-base best score position too.
     return peak_list
 
 
-################################################################################
+###############################################################################
 
-def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False):
+def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed,
+                               print_rows=False):
     """
-    Given a .bed file of sequence peak regions (possible coordinates from 
+    Given a .bed file of sequence peak regions (possible coordinates from
     0 to length of s), convert peak coordinates to genomic coordinates.
     Do this by taking genomic regions of sequences as input.
 
@@ -944,7 +1017,10 @@
             row = line.strip()
             cols = line.strip().split("\t")
             site_id = cols[3]
-            assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(args.genomic_sites_bed)
+            assert site_id \
+                not in id2row_dic, \
+                "column 4 IDs not unique in given .bed file \"%s\"" \
+                % (genomic_sites_bed)
            id2row_dic[site_id] = row
     f.close()
 
@@ -958,10 +1034,13 @@
            site_e = int(cols[2])
            site_id2 = cols[3]
            site_sc = float(cols[4])
-            assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" %(site_id2)
-            m = re.search(".+,(\d+)", site_id2)
-            sc_pos = int(m.group(1)) # 1-based.
-            assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" %(site_id)
+            assert re.search(".+,.+", site_id2), \
+                "regular expression failed for ID \"%s\"" % (site_id2)
+            m = re.search(r".+,(\d+)", site_id2)
+            sc_pos = int(m.group(1))  # 1-based.
+            assert site_id in id2row_dic, \
+                "site ID \"%s\" not found in genomic sites dictionary" \
+                % (site_id)
            row = id2row_dic[site_id]
            rowl = row.split("\t")
            gen_chr = rowl[0]
@@ -974,21 +1053,23 @@
            if gen_pol == "-":
                new_s = gen_e - site_e
                new_e = gen_e - site_s
-                new_sc_pos = gen_e - sc_pos + 1 # keep 1-based.
-            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" %(gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol)
-            OUTPEAKS.write("%s\n" %(new_row))
+                new_sc_pos = gen_e - sc_pos + 1  # keep 1-based.
+            new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \
+                % (gen_chr, new_s, new_e,
+                   site_id, new_sc_pos, site_sc, gen_pol)
+            OUTPEAKS.write("%s\n" % (new_row))
            if print_rows:
                print(new_row)
     OUTPEAKS.close()
 
 
-################################################################################
+###############################################################################
 
 def diff_two_files_identical(file1, file2):
     """
-    Check whether two files are identical. Return true if diff reports no 
+    Check whether two files are identical. Return true if diff reports no
     differences.
-    
+
     >>> file1 = "test-data/file1"
     >>> file2 = "test-data/file2"
     >>> diff_two_files_identical(file1, file2)
@@ -1006,6 +1087,4 @@
     return same
 
 
-################################################################################
-
-
+###############################################################################
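For reference, the windowed-mean smoothing that list_moving_window_average_values() implements (and that graphprot_profile_get_tsm() and graphprot_profile_calc_avg_profile() build on) is easier to follow outside the diff. The following is a minimal, self-contained sketch of the same idea, based only on the docstring above; the name moving_window_average is illustrative and not part of gplib.py.

    # Sketch of the windowed-mean smoothing described in the
    # list_moving_window_average_values() docstring: each position gets
    # the mean over [i - win_extlr, i + win_extlr], and windows are
    # clipped at the list ends ("it just takes what it gets").

    def moving_window_average(in_list, win_extlr=5):
        n = len(in_list)
        new_list = []
        for i in range(n):
            s = max(i - win_extlr, 0)      # clip window start at 0
            e = min(i + win_extlr + 1, n)  # clip window end at list length
            new_list.append(sum(in_list[s:e]) / (e - s))
        return new_list

    print(moving_window_average([0, 0, 10, 0, 0], win_extlr=1))
    # [0.0, 3.33..., 3.33..., 3.33..., 0.0]

Edge positions average over fewer values rather than padding, which is why the avg_profile scores taper off toward site boundaries.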
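The bed_peaks_to_genomic_peaks() hunks also touch the one subtle piece of arithmetic in the file: lifting a sequence-local peak (BED-style, 0-based start, 1-based end) back to genomic coordinates. Below is a sketch of that mapping; the minus-strand branch mirrors the hunk above, while the plus-strand branch lies outside the shown context lines and is an assumption here, as is the helper name peak_to_genomic.

    # Sketch of the strand-aware peak lifting in bed_peaks_to_genomic_peaks().
    # Minus-strand math is taken from the hunk above; the plus-strand
    # branch is assumed (not visible in the diff context).

    def peak_to_genomic(site_s, site_e, gen_s, gen_e, gen_pol):
        if gen_pol == "-":
            # Minus strand: the sequence runs right-to-left, so the peak
            # is mirrored off the genomic end coordinate.
            return gen_e - site_e, gen_e - site_s
        # Plus strand: shift local coordinates by the genomic start.
        return gen_s + site_s, gen_s + site_e

    # A site chr1:1000-1100(-) with a local peak at 10-20 maps to 1080-1090.
    print(peak_to_genomic(10, 20, 1000, 1100, "-"))  # (1080, 1090)

The same mirroring, shifted by +1, is applied to the 1-based best-score position (new_sc_pos = gen_e - sc_pos + 1 in the hunk), which keeps it 1-based after the flip.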