comparison lissero/variant4b.py @ 10:412d55f09755 draft

Uploaded
author estrain
date Thu, 08 Feb 2024 18:18:47 +0000
parents
children 40f397e29951
comparison
equal deleted inserted replaced
9:57d1d335ce88 10:412d55f09755
1 import sys
2
def identify_variants_with_genes(input_file_path, output_file_path):
    """Relabel rows of a tab-separated table as "4b variant".

    The first line of the input is treated as the header. For every data row
    in which the LMO0737, ORF2110 and ORF2819 columns are all "FULL", the
    SEROTYPE column is overwritten with "4b variant". All data rows are then
    written to the output file after the unchanged header line.

    Args:
        input_file_path: Path of the tab-separated input table.
        output_file_path: Path the resulting table is written to.

    Returns:
        None. If the input holds no more than a header line, a message is
        printed and no output file is written.

    Raises:
        ValueError: If the header lacks one of the gene columns or the
            SEROTYPE column.
    """
    # Define the genes of interest
    genes_of_interest = ['LMO0737', 'ORF2110', 'ORF2819']

    # Open the input file and read its lines
    with open(input_file_path, 'r') as file:
        lines = file.readlines()

    # Check if the file has more than just the header
    if len(lines) <= 1:
        print("Input file does not contain enough data.")
        return

    # Strip only the line terminator before splitting: a plain strip() would
    # also eat leading/trailing tabs and shift every column when the first or
    # last cell is empty. Surrounding spaces are trimmed per cell instead.
    headers = [name.strip() for name in lines[0].rstrip('\r\n').split('\t')]
    gene_indices = [headers.index(gene) for gene in genes_of_interest]
    serotype_index = headers.index('SEROTYPE')
    # Smallest number of fields a row needs for all lookups to be valid
    min_fields = max(gene_indices + [serotype_index]) + 1

    # Initialize a list to hold the output lines, starting with the header
    modified_lines = [lines[0]]

    # Process each data line in the input file
    for line in lines[1:]:
        row = line.rstrip('\r\n')
        # Blank lines (e.g. a trailing empty line) carry no record
        if not row.strip():
            continue
        data = row.split('\t')
        # Rows too short to hold every required column cannot be evaluated;
        # they are passed through unchanged instead of raising IndexError.
        if len(data) >= min_fields and all(
            data[index].strip() == 'FULL' for index in gene_indices
        ):
            # Modify the SEROTYPE column to "4b variant"
            data[serotype_index] = "4b variant"
        # Rejoin the (possibly modified) data and add it to the list
        modified_lines.append('\t'.join(data) + '\n')

    # Write the header and data lines to the output file
    with open(output_file_path, 'w') as file:
        file.writelines(modified_lines)

    print(f'Results written to {output_file_path}')
39
if __name__ == "__main__":
    # Exactly two positional arguments are expected after the script name:
    # the input table path and the output table path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python script.py <input_file_path> <output_file_path>")
        sys.exit(1)

    input_file_path, output_file_path = cli_args
    identify_variants_with_genes(input_file_path, output_file_path)
48