changeset 7:4d8c94b9a6d7 draft
Uploaded
| author | charles-bernard |
|---|---|
| date | Thu, 27 Oct 2016 10:11:12 -0400 |
| parents | 54c2c66e45a9 |
| children | 2b94580bcd89 |
| files | data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py |
| diffstat | 1 files changed, 43 insertions(+), 41 deletions(-) |
--- a/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py	Thu Oct 27 09:17:41 2016 -0400
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py	Thu Oct 27 10:11:12 2016 -0400
@@ -17,6 +17,7 @@
     parser = OptionParser()
     parser.add_option("-e", "--ensembl", dest = 'ensembl_info', action = "store", nargs = 2, metavar = ("kingdom", "species_name"), type = "str")
     parser.add_option("-o", "--output", dest='output_filename', action="store", nargs = 1, metavar = 'JSON_FILE')
+    parser.add_option("--log", dest='log_filename', action="store", nargs=1, metavar='log_report')
     (options, args) = parser.parse_args()
     return options, args
 
@@ -39,13 +40,13 @@
 
 
 def uncompress_gz(gz_file_name, uncompressed_file_name):
-    logging.info("____________________________________________________________")
-    logging.info("*** Uncompressing %s" % gz_file_name)
+    print("____________________________________________________________")
+    print("*** Uncompressing %s" % gz_file_name)
     uncompressed_file = open(uncompressed_file_name, 'wb')
     with gzip.open(gz_file_name, 'rb') as src_file:
         uncompressed_file.write(src_file.read())
     uncompressed_file.close()
-    logging.info("-> Uncompressed !\n")
+    print("-> Uncompressed !\n")
 
 
 def add_data_table_entry( data_manager_dict, data_table_entry ):
@@ -61,19 +62,19 @@
 
 
 def get_ensembl_url_root(kingdom):
-    logging.info("____________________________________________________________")
-    logging.info("*** Determining Ensembl ftp root url")
+    print("____________________________________________________________")
+    print("*** Determining Ensembl ftp root url")
     if kingdom == 'vertebrates':
         root = 'ftp://ftp.ensembl.org/pub/current_gtf/'
     else:
         root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom
-    logging.info("-> Determined !\n")
+    print("-> Determined !\n")
     return root
 
 
 def test_ensembl_species_exists(kingdom, url, species_name):
-    logging.info("____________________________________________________________")
-    logging.info ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
+    print("____________________________________________________________")
+    print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
     list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
     if kingdom=='vertebrates':
         download_file(url, list_species_file_name)
@@ -95,9 +96,9 @@
         columns = species_lines[0].split('\t')
         found_species_name = columns[1]
         if species_name != found_species_name:
-            logging.info('-> \'%s\' has been replace with the complete species name \'%s\'' % (species_name, found_species_name))
+            print('-> \'%s\' has been replace with the complete species name \'%s\'' % (species_name, found_species_name))
             return found_species_name, species_lines_matched
-        logging.info("-> Referenced !\n")
+        print("-> Referenced !\n")
         return species_name, species_lines_matched
     else:
         list_species = [''] * nb_lines
@@ -106,7 +107,7 @@
             list_species[i] = columns[1]
             exact_match = re.search('^%s$' % species_name, list_species[i])
             if exact_match:
-                logging.info("-> Referenced !\n")
+                print("-> Referenced !\n")
                 return species_name, species_lines[i]
         msg = 'The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\nPlease retry with one of the following species names:\n%s' % (species_name, list_species[0:])
         logging.critical(msg)
@@ -114,26 +115,26 @@
 
 
 def get_ensembl_collection(kingdom, species_line):
-    logging.info("*** Extracting the %s_collection of the species" % kingdom)
+    print("*** Extracting the %s_collection of the species" % kingdom)
     collection_regex = re.compile('%s_.+_collection' % kingdom.lower())
     collection_match = re.search(collection_regex, species_line)
     if not collection_match:
-        logging.info("-> Skiped: this species is not classified in a Ensembl %s collection\n" % kingdom)
+        print("-> Skiped: this species is not classified in a Ensembl %s collection\n" % kingdom)
         return None
-    logging.info("-> Extracted !\n")
+    print("-> Extracted !\n")
     return collection_match.group(0)
 
 
 def get_ensembl_gtf_archive_name(url_dir, species_name):
-    logging.info("____________________________________________________________")
-    logging.info("*** Extracting the gtf archive name of %s" % species_name)
+    print("____________________________________________________________")
+    print("*** Extracting the gtf archive name of %s" % species_name)
     gtf_archive_regex = re.compile('%s\..*\.[0-9]+\.gtf\.gz' % species_name, flags = re.IGNORECASE)
     dir_content = get_page_content(url_dir)
     gtf_archive_match = re.search(gtf_archive_regex, dir_content)
     if not gtf_archive_match:
         sys.exit('The species is referenced on Ensembl but error of nomenclature led to download failure')
     gtf_archive_name = gtf_archive_match.group(0)
-    logging.info("-> Extracted !\n")
+    print("-> Extracted !\n")
     return gtf_archive_name
 
 
@@ -146,23 +147,23 @@
         url = url + "%s/" % collection
     final_url = url + species_name + '/'
     gtf_archive_name = get_ensembl_gtf_archive_name(final_url, species_name)
-    logging.info("____________________________________________________________")
-    logging.info("*** Download the gtf archive of %s" % species_name)
+    print("____________________________________________________________")
+    print("*** Download the gtf archive of %s" % species_name)
     download_file(final_url + gtf_archive_name, gtf_archive_name)
-    logging.info("-> Downloaded !\n")
+    print("-> Downloaded !\n")
     return gtf_archive_name
 
 
 def generate_alfa_indexes(path_to_alfa, gtf_file_name):
-    logging.info("____________________________________________________________")
-    logging.info("*** Generating alfa indexes from %s" % gtf_file_name)
+    print("____________________________________________________________")
+    print("*** Generating alfa indexes from %s" % gtf_file_name)
     alfa_result = subprocess.Popen(['python', path_to_alfa, '-a', gtf_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     alfa_out, alfa_err = alfa_result.communicate()
     if alfa_err != None and not re.search('### End of program', alfa_err):
         msg = 'Generation Failed due an alfa error: %s' % (alfa_err)
         logging.critical(msg)
         sys.exit(msg)
-    logging.info("-> Generated !\n")
+    print("-> Generated !\n")
 
 
 def get_data_table_new_entry(gtf_archive_name):
@@ -197,8 +198,8 @@
     tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='', dir=path_to_tmp_dir)
     os.chdir(tmp_dir)
 
-    log_file_name = 'galaxy_log_report.log'
-    logging.basicConfig(level=logging.INFO, filename=log_file_name, filemode="a+", format='%(message)s')
+    #log_file_name = 'galaxy_log_report.log'
+    #logging.basicConfig(level=print, filename=log_file_name, filemode="a+", format='%(message)s')
 
     data_manager_dict = {}
     if options.ensembl_info:
@@ -215,26 +216,27 @@
             unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
             add_data_table_entry(data_manager_dict, data_table_entry)
 
 
-    logging.info("____________________________________________________________")
-    logging.info("*** General Info")
-    logging.info("TMP DIR:\t%s" % tmp_dir)
-    logging.info("TARGET DIR:\t%s" % target_directory)
-    logging.info("URL ROOT:\t%s" % url)
-    logging.info("SPECIES:\t%s" % data_table_entry['species'])
-    logging.info("VERSION:\t%s" % data_table_entry['version'])
-    logging.info("RELEASE:\t%s" % data_table_entry['release'])
-    logging.info("VALUE:\t%s" % data_table_entry['value'])
-    logging.info("DBKEY:\t%s" % data_table_entry['dbkey'])
-    logging.info("NAME:\t%s" % data_table_entry['name'])
-    logging.info("PREFIX:\t%s" % data_table_entry['prefix'])
-    logging.info("____________________________________________________________")
-    logging.info("*** Intial dictionary")
-    logging.info("%s" % params)
+    print("____________________________________________________________")
+    print("*** General Info")
+    print("TMP DIR:\t%s" % tmp_dir)
+    print("TARGET DIR:\t%s" % target_directory)
+    print("URL ROOT:\t%s" % url)
+    print("SPECIES:\t%s" % data_table_entry['species'])
+    print("VERSION:\t%s" % data_table_entry['version'])
+    print("RELEASE:\t%s" % data_table_entry['release'])
+    print("VALUE:\t%s" % data_table_entry['value'])
+    print("DBKEY:\t%s" % data_table_entry['dbkey'])
+    print("NAME:\t%s" % data_table_entry['name'])
+    print("PREFIX:\t%s" % data_table_entry['prefix'])
+    print("____________________________________________________________")
+    print("*** Intial dictionary")
+    print("%s" % params)
 
     shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name))
     shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))
-    shutil.copyfile(log_file_name, os.path.join(target_directory, log_file_name))
+    #shutil.copyfile(log_file_name, os.path.join(target_directory, log_file_name))
+    #shutil.copyfile(log_file_name, options.log_filename)
 
     cleanup_before_exit(tmp_dir)