annotate prophage_finder.py @ 2:8674f554d76b draft

Uploaded
author pedro_araujo
date Fri, 29 Jan 2021 16:01:12 +0000
parents e4b3fc88efe0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
1 import ast
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
2 import json
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
3 import os
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
4
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
5 import pandas as pd
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
6
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
7 from FeatureConstruction import *
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
8
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
9
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
def phages_bact(frame=None, column='Host_ID'):
    """Count the phages that have at least one associated host.

    Every cell of ``column`` is parsed with ``ast.literal_eval`` and the row
    is counted when the parsed value is truthy (for instance a non-empty
    list such as ``"['NC_000913.1']"``).

    Args:
        frame: Table to inspect. When omitted, the module-level ``data``
            frame is used, so existing ``phages_bact()`` calls keep working.
        column: Name of the column holding the stringified host collection.

    Returns:
        The number of rows whose parsed host collection is non-empty.

    Raises:
        KeyError: If ``column`` is not present in the table.
        ValueError: If a cell is not a valid Python literal.
    """
    if frame is None:
        # Looked up at call time: ``data`` only needs to exist once the
        # function is actually invoked, not when it is defined.
        frame = data
    return sum(1 for cell in frame[column] if ast.literal_eval(cell))
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
16
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
17
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
# Phage/host table; the first CSV column is used as the row index.
data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)

# NOTE(review): absolute, machine-specific path while every other input
# lives under files/ - confirm where this JSON is supposed to come from.
with open('C:/Users/Pedro/Downloads/pha_in_bac_2_test.json', encoding='utf-8') as handle:
    prophage = json.load(handle)

# For each bacterium key in the JSON, record it (with a '.1' suffix) as a
# host of every listed phage that is present in the table.
for bact_id, phage_ids in prophage.items():
    host_id = bact_id + '.1'
    for phage_id in phage_ids:
        if phage_id not in data.index:
            continue
        hosts = ast.literal_eval(data.loc[phage_id, 'Host_ID'])
        if host_id in hosts:
            continue
        hosts.append(host_id)
        # Cells hold the string form of the list, so serialise it back.
        data.loc[phage_id, 'Host_ID'] = str(hosts)

# Persist the merged table over the input file.
data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
32
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
fc = FeatureConstruction()
phageTails = fc.phageTails

# Cluster the tail proteins with cd-hit; the cluster listing is read from
# files/cdhit.clstr below. A failed run is reported instead of being
# silently ignored, because an older listing could otherwise be parsed.
if os.system('cd-hit -i files/tails.fasta -o files/cdhit') != 0:
    print('Warning: cd-hit did not exit cleanly; files/cdhit.clstr may be missing or stale')

# This section historically reads 'Bacteria ID', whereas the rest of the
# script (phages_bact and the prophage merge) uses 'Host_ID'. Keep the
# historical column when it exists, otherwise fall back to 'Host_ID'.
# NOTE(review): confirm which column the CSV really carries.
host_column = 'Bacteria ID' if 'Bacteria ID' in data.columns else 'Host_ID'

# Protein -> first phage (in phageTails iteration order) that carries it.
# Built once instead of rescanning every phage for every cluster member.
protein_owner = {}
for phage in phageTails:
    for prot in phageTails[phage].keys():
        protein_owner.setdefault(prot, phage)


def _protein_id(line):
    """Return the text between the first '>' and the first '...' of a line."""
    return line[line.find('>') + 1:line.find('...')]


def _share_reference_hosts(ref_phage, members):
    """Add the hosts of ``ref_phage`` to the phage owning each member protein.

    Does nothing when the cluster has no usable reference phage (member 0
    matched no phage, or that phage is absent from ``data``).
    """
    if ref_phage is None or ref_phage not in data.index:
        return
    ref_hosts = ast.literal_eval(data.loc[ref_phage, host_column])
    for prot in members:
        phage = protein_owner.get(prot)
        if phage is None or phage not in data.index:
            continue
        hosts = ast.literal_eval(data.loc[phage, host_column])
        changed = False
        for host in ref_hosts:
            if host not in hosts:
                hosts.append(host)
                changed = True
        if changed:
            # Cells hold the string form of the list.
            data.loc[phage, host_column] = str(hosts)


ref_phage = None
temp_cluster = []
with open('files/cdhit.clstr', 'r') as f:
    for line in f:
        if '>Cluster' in line:
            # A new header closes the previous cluster.
            _share_reference_hosts(ref_phage, temp_cluster)
            # Reset so a cluster without a resolvable member 0 cannot
            # inherit the reference phage of the previous cluster.
            ref_phage = None
            temp_cluster = []
        elif line[0] == '0':
            # Member 0 of the cluster is used as the reference protein.
            ref_phage = protein_owner.get(_protein_id(line))
        else:
            temp_cluster.append(_protein_id(line))

# Members listed after the final '>Cluster' header are only handled here;
# without this call the last cluster of the file would be dropped.
_share_reference_hosts(ref_phage, temp_cluster)
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
66
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
67
e4b3fc88efe0 Uploaded
pedro_araujo
parents:
diff changeset
# Bacterial sequences keyed by bacterium identifier.
with open('files/bactDNA.json', encoding='utf-8') as F:
    bacProt = json.load(F)

# This section historically reads 'Bacteria ID', whereas phages_bact and
# the prophage merge use 'Host_ID'. Keep the historical column when it
# exists, otherwise fall back to 'Host_ID'.
# NOTE(review): confirm which column the CSV really carries.
host_column = 'Bacteria ID' if 'Bacteria ID' in data.columns else 'Host_ID'

# Keys of a parsed JSON object are already unique, so no "already done"
# bookkeeping is needed while iterating.
for bact in bacProt:
    # Write the current sequence as a single-record FASTA for Phigaro.
    with open('files/temp_genome.fasta', 'w') as F:
        F.write('>' + bact + '\n' + bacProt[bact] + '\n')

    # Drop the report of the previous bacterium first: if this run fails or
    # writes nothing, the old report must not be credited to ``bact``.
    try:
        os.remove('files/temp_phigaro.html')
    except FileNotFoundError:
        pass
    status = os.system('phigaro -f files/temp_genome.fasta --not-open -d -o files/temp_phigaro')  # Phigaro
    if not os.path.exists('files/temp_phigaro.html'):
        if status != 0:
            print('Warning: phigaro did not exit cleanly for', bact)
        continue

    with open('files/temp_phigaro.html', 'r') as Ph:
        tempPhigaro = Ph.readlines()

    for line in tempPhigaro:
        if '<div class="accordion-body collapse"' not in line:
            continue
        # Everything after the first '>' on the line is a ', '-separated
        # list of VOG names.
        VOGs = line[line.find('>') + 1:].strip('\n').split(', ')
        for vog in VOGs:
            if not vog:
                # An empty list would otherwise try to open '.txt'.
                continue
            with open('files/VOG_tables/' + vog + '.txt', 'r') as f:
                next(f, None)  # the first line of each table is skipped
                for row in f:
                    fields = row.split('\t')
                    if len(fields) < 3:
                        # Blank or truncated row: no third column to read.
                        continue
                    # Strip so a trailing newline cannot hide a match when
                    # the third column is the last one on the row.
                    phage = fields[2].strip()
                    if phage in data.index:
                        temp = ast.literal_eval(data.loc[phage, host_column])
                        if bact not in temp:
                            temp.append(bact)
                            data.loc[phage, host_column] = str(temp)

    # Progress counter, rewritten in place on the same terminal line.
    print('Number of phages with associated bacteria strains:', phages_bact(), end="\r")

    # Disabled alternatives, kept for reference only.
    #   PHASTER (server was full):
    #     os.system('wget --post-file="files/temp_genome.fasta" "http://phaster.ca/phaster_api?contigs=1" -O files/temp_phaster')
    #     with open('files/temp_phaster', encoding='utf-8') as F:
    #         temp = json.loads(F.read())
    #     os.system('wget "http://phaster.ca/phaster_api?acc=' + temp['job_id'] + 'Z" -O files/temp_phaster')
    #   PhiSpy (not possible with FASTA input):
    #     os.system('PhiSpy.py files/temp_genome.fasta -o files/temp_phipsy')