annotate phage_host_prediction/prophage_finder.py @ 2:3e1e8be4e65c draft default tip

Uploaded
author pedro_araujo
date Fri, 02 Apr 2021 10:11:13 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
1 import ast
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
2 import json
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
3 import os
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
4
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
5 import pandas as pd
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
6
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
7 from FeatureConstruction import *
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
8
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
9
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
def phages_bact(frame=None, column='Host_ID'):
    """Count the phages that have at least one associated bacterium.

    Every cell of ``column`` is parsed with ``ast.literal_eval``; the script
    stores host lists there as ``str(list)``. A row is counted when the parsed
    value is truthy (for a list: non-empty).

    Args:
        frame: Table to inspect. When omitted, the module-level ``data``
            DataFrame is used, which is what the zero-argument call relies on.
        column: Name of the column holding the stringified host lists.

    Returns:
        int: Number of rows whose parsed ``column`` value is truthy.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            cell is not a valid Python literal.
    """
    if frame is None:
        frame = data
    # Walk the column values directly instead of doing a ``.loc`` lookup per
    # index label: with a duplicated label ``.loc`` returns a Series, which
    # ``ast.literal_eval`` cannot parse.
    return sum(1 for cell in frame[column] if ast.literal_eval(cell))
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
16
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
17
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
# Load the phage/bacteria table; the first CSV column becomes the index.
data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)

# Prophage predictions: the keys are iterated as bacteria and each value as a
# collection of phage identifiers.
# NOTE(review): this is an absolute path on one user's machine — confirm where
# the file should live before running elsewhere.
with open('C:/Users/Pedro/Downloads/pha_in_bac_2_test.json', encoding='utf-8') as F:
    prophage = json.load(F)

for bact, found_phages in prophage.items():
    # Hosts are recorded as the JSON key with '.1' appended
    # (presumably an accession version suffix — TODO confirm).
    host_id = bact + '.1'
    for phage in found_phages:
        if phage not in data.index:
            continue
        hosts = ast.literal_eval(data.loc[phage, 'Host_ID'])
        if host_id in hosts:
            continue
        hosts.append(host_id)
        # The list is stored back as its string representation.
        data.loc[phage, 'Host_ID'] = str(hosts)

# Overwrite the input table with the merged host lists.
data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
32
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
fc = FeatureConstruction()
phageTails = fc.phageTails

# Cluster the tail proteins; the cluster listing is read back from
# files/cdhit.clstr below.
os.system('cd-hit -i files/tails.fasta -o files/cdhit')


def _clstr_protein(line):
    """Return the text between '>' and '...' on a .clstr member line."""
    return line[line.find('>') + 1:line.find('...')]


def _read_clusters(path):
    """Yield ``(ref_prot, members)`` for every cluster in a .clstr file.

    ``ref_prot`` is the protein on the line starting with '0' (``None`` when
    the cluster has no such line); ``members`` are the proteins on the other
    lines. The last cluster is yielded as well, even though no '>Cluster'
    header follows it.
    """
    ref_prot, members = None, []
    with open(path, 'r') as handle:
        for line in handle:
            if not line.strip():
                continue
            if '>Cluster' in line:
                if ref_prot is not None or members:
                    yield ref_prot, members
                ref_prot, members = None, []
            elif line.startswith('0'):
                # NOTE(review): cd-hit flags the representative with a
                # trailing '*'; entry 0 is kept as the reference here —
                # confirm the two always coincide.
                ref_prot = _clstr_protein(line)
            else:
                members.append(_clstr_protein(line))
    # Flush the final cluster; processing only on the next header dropped it.
    if ref_prot is not None or members:
        yield ref_prot, members


# Map each tail protein to the first phage listing it, so every cluster
# member is resolved with one lookup instead of a scan over all phages.
prot_owner = {}
for phage in phageTails:
    for prot in phageTails[phage].keys():
        prot_owner.setdefault(prot, phage)

# Propagate the reference phage's bacteria to the phages owning the other
# proteins of the same cluster.
for ref_prot, members in _read_clusters('files/cdhit.clstr'):
    ref_phage = prot_owner.get(ref_prot)
    if ref_phage is None or ref_phage not in data.index:
        # No usable reference for this cluster: skip it instead of reusing
        # the phage left over from the previous cluster.
        continue
    temp_ref = ast.literal_eval(data.loc[ref_phage, 'Bacteria ID'])
    for prot in members:
        phage = prot_owner.get(prot)
        if phage is None or phage not in data.index:
            continue
        temp = ast.literal_eval(data.loc[phage, 'Bacteria ID'])
        missing = [i for i in temp_ref if i not in temp]
        if missing:
            temp.extend(missing)
            # The list is stored back as its string representation.
            data.loc[phage, 'Bacteria ID'] = str(temp)
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
66
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
67
3e1e8be4e65c Uploaded
pedro_araujo
parents:
diff changeset
# Bacterial sequences keyed by bacterium identifier.
with open('files/bactDNA.json', encoding='utf-8') as F:
    bacProt = json.load(F)

# For every bacterium: write its sequence to a temporary FASTA, run phigaro on
# it, and add the bacterium to the 'Bacteria ID' list of each phage found in
# the VOG tables named in the phigaro HTML report.
# NOTE(review): the visible code never writes `data` back to disk after these
# edits — confirm whether a final save is missing.
listDone = []
for bact in bacProt:
    if bact in listDone:
        continue
    listDone.append(bact)

    with open('files/temp_genome.fasta', 'w') as fasta:
        fasta.write('>' + bact + '\n' + bacProt[bact] + '\n')
    os.system('phigaro -f files/temp_genome.fasta --not-open -d -o files/temp_phigaro') # Phigaro
    with open('files/temp_phigaro.html', 'r') as Ph:
        tempPhigaro = Ph.readlines()

    for line in tempPhigaro:
        if '<div class="accordion-body collapse"' not in line:
            continue
        # Everything after the first '>' on the line is a ', '-separated
        # list of VOG names.
        VOGs = line[line.find('>')+1:].strip('\n').split(', ')
        for vog in VOGs:
            with open('files/VOG_tables/' + vog + '.txt', 'r') as f:
                temp_phages = f.readlines()
            # Skip the first line of the table; the phage is the third
            # tab-separated field of every other line.
            for row in temp_phages[1:]:
                phage = row.split('\t')[2]
                if phage not in data.index:
                    continue
                hosts = ast.literal_eval(data.loc[phage, 'Bacteria ID'])
                if bact in hosts:
                    continue
                hosts.append(bact)
                data.loc[phage, 'Bacteria ID'] = str(hosts)
    # Progress line, rewritten in place via the carriage return.
    # NOTE(review): phages_bact() reads 'Host_ID' while this loop edits
    # 'Bacteria ID' — confirm which column the counter should report.
    print('Number of phages with associated bacteria strains:', phages_bact(), end="\r")

# Disabled alternatives (PHASTER API and PhiSpy), kept as an inert string.
# The Portuguese notes inside read "server full" and "not possible with FASTA
# files".
'''os.system('wget --post-file="files/temp_genome.fasta" "http://phaster.ca/phaster_api?contigs=1" -O files/temp_phaster') # Phaster
with open('files/temp_phaster', encoding='utf-8') as F:
temp = json.loads(F.read())
os.system('wget "http://phaster.ca/phaster_api?acc=' + temp['job_id'] + 'Z" -O files/temp_phaster') # servidor cheio
os.system('PhiSpy.py files/temp_genome.fasta -o files/temp_phipsy') # Phipsy - não possível com fastas'''