1
|
1 # input a genome file and return a file genome.chrom.sizes to be associated with the custom build (or just have it as an output to be used later in the history.
|
|
2 # adapted from https://bioexpressblog.wordpress.com/2014/04/15/calculate-length-of-all-sequences-in-an-multi-fasta-file/
|
|
3 from sys import argv
|
|
4 # python calculating_chrom.sizes.py genome_input.fa output.chrom.sizes
|
6
|
5 fasta_source = str(argv[1])
|
15
|
6 prefix = str(argv[2])
|
|
7 output = str(argv[3])
|
|
8 genome = str(argv[4])
|
|
9 builtin = str(argv[5])
|
|
10
|
1
|
11 # genome = 'test-data/test.fasta'
|
|
12 # output = "test-data/test_chrom.sizes"
|
8
|
13 print(fasta_source, genome, builtin, prefix, output)
|
|
14 if fasta_source == 'builtin':
|
|
15 genome = builtin
|
1
|
16
|
|
17 chromSizesoutput = open(output,"w")
|
|
18
|
|
19 records = []
|
|
20 record = False
|
|
21 for line in open(genome, 'r').readlines():
|
|
22 if line[0] == '>':
|
|
23 if record:
|
|
24 records.append(record)
|
|
25 record = [line.strip("\n").split(' ')[0][1:], 0]
|
|
26
|
|
27 else:
|
|
28 sequence = line.strip('\n')
|
|
29 record[1] += len(sequence)
|
3
|
30
|
|
31 if record not in records:
|
|
32 records.append(record)
|
|
33
|
1
|
34 for seq_record in records:
|
4
|
35 if prefix != 'none':
|
|
36 output_line = f"{prefix}{seq_record[0]}\t{seq_record[1]}\n"
|
|
37 else:
|
|
38 output_line = f"{seq_record[0]}\t{seq_record[1]}\n"
|
|
39
|
1
|
40 chromSizesoutput.write(output_line)
|
|
41
|
|
42 chromSizesoutput.close()
|