13
+ − 1 #!/usr/bin/env python
+ − 2 def get_contig_lengths(in_fai):
+ − 3 """ Get contig lengths from a fasta index. """
+ − 4 lens = dict()
+ − 5 with open(in_fai, 'r') as f:
+ − 6 for line in f:
+ − 7 L1 = line.rstrip().split('\t')
+ − 8 lens[L1[0]] = int(L1[1])
+ − 9 return lens
+ − 10
+ − 11
+ − 12 def get_contig_orderings(in_groupings):
+ − 13 """
+ − 14 From the orderings files, return the following:
+ − 15
+ − 16 1. Dictionary associating a reference header with a list of ordered contig headers
+ − 17 2. Dictionary associating a contig with an orientation
+ − 18 3. Dicitonary associating a contig with its assigned reference header
+ − 19 """
+ − 20 orderings = dict()
+ − 21 orientations = dict()
+ − 22 ref_chr = dict()
+ − 23
+ − 24 # Iterate through the orderings files
+ − 25 with open(in_groupings, 'r') as f1:
+ − 26 for line1 in f1:
+ − 27 header = line1.rstrip().replace('_orderings.txt', '_RaGOO')
+ − 28 header = header[header.rfind('/') + 1:]
+ − 29
+ − 30 # Initialize the list for the orderings
+ − 31 orderings[header] = []
+ − 32 with open(line1.rstrip(), 'r') as f2:
+ − 33 for line2 in f2:
+ − 34 L1 = line2.rstrip().split('\t')
+ − 35 orderings[header].append(L1[0])
+ − 36 orientations[L1[0]] = L1[1]
+ − 37 ref_chr[L1[0]] = header
+ − 38 return orderings, orientations, ref_chr
+ − 39
+ − 40
+ − 41 def get_reverse_coords(start_coord, end_coord, seq_length):
+ − 42 """
+ − 43 Returns new genomic coordinates of a region that has undergone reverse complementation.
+ − 44 new start coordinate = seqLength - endCoord
+ − 45 new end coordinate = seqLength - startCoord
+ − 46 """
+ − 47 return seq_length - (end_coord - 1), seq_length - (start_coord - 1)
+ − 48
+ − 49
+ − 50 if __name__ == "__main__":
+ − 51 import argparse
+ − 52
+ − 53 parser = argparse.ArgumentParser(description='Lift-over gff coordinates to from contigs to RaGOO pseudomolecules.'
+ − 54 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used.'
+ − 55 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.')
+ − 56 parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over")
+ − 57 parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'")
+ − 58 parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'")
+ − 59 parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.")
+ − 60
+ − 61
+ − 62 # Get the command line arguments
+ − 63 args = parser.parse_args()
+ − 64 gff_file = args.gff
+ − 65 orderings_file = args.orderings
+ − 66 fai_file = args.fai
+ − 67 gap_size = args.g
+ − 68
+ − 69 # Get the contig lengths
+ − 70 ctg_lens = get_contig_lengths(fai_file)
+ − 71
+ − 72 # Get the contig orderings and orientations
+ − 73 ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file)
+ − 74
+ − 75 #Iterate through the GFF features and update
+ − 76 offsets = dict()
+ − 77 with open(gff_file) as f:
+ − 78 for gff_line in f:
+ − 79 if gff_line.startswith("#"):
+ − 80 print(gff_line.rstrip())
+ − 81 else:
+ − 82 gff_fields = gff_line.rstrip().split('\t')
+ − 83 gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6]
+ − 84 new_header = ctg_chr[gff_header]
+ − 85
+ − 86 # Check that this contig header is in our list of headers
+ − 87 if gff_header not in ctg_lens.keys():
+ − 88 err_msg = """ %s was not found in the list of orderings files provided.
+ − 89 Please check that you are using the correct files. If chimeric contig correction was
+ − 90 used, please use the corrected gff and fai file.
+ − 91 """
+ − 92 raise ValueError(err_msg)
+ − 93
+ − 94 # Check if the contig has been reverse complemented. Update accordingly
+ − 95 if ctg_orientations[gff_header] == '-':
+ − 96 start, end = get_reverse_coords(start, end, ctg_lens[gff_header])
+ − 97
+ − 98 # Set the opposite strand
+ − 99 if strand == '+':
+ − 100 strand = '-'
+ − 101 else:
+ − 102 strand = '+'
+ − 103
+ − 104 # Check if the offset for this contig has already been calculated
+ − 105 if gff_header in offsets:
+ − 106 offset = offsets[gff_header]
+ − 107 else:
+ − 108 ctg_idx = ctg_orderings[new_header].index(gff_header)
+ − 109 offset = 0
+ − 110
+ − 111 for ctg in ctg_orderings[new_header][:ctg_idx]:
+ − 112 offset += ctg_lens[ctg]
+ − 113 offset += gap_size
+ − 114
+ − 115 # memoize the offset
+ − 116 offsets[gff_header] = offset
+ − 117
+ − 118 new_start = start + offset
+ − 119 new_end = end + offset
+ − 120
+ − 121 gff_fields[0] = new_header
+ − 122 gff_fields[3] = str(new_start)
+ − 123 gff_fields[4] = str(new_end)
+ − 124 gff_fields[6] = strand
+ − 125 print('\t'.join(gff_fields))