diff pair_fetch_DNA_ff.py @ 0:70f8259b0b30 draft

Uploaded
author arkarachai-fungtammasan
date Wed, 01 Apr 2015 16:48:58 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pair_fetch_DNA_ff.py	Wed Apr 01 16:48:58 2015 -0400
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# pair_fetch_DNA_ff.py
+# Function: filter microsat and flanking region by quality score;
+# remove read with any base that has lower quality score than "quality_require" within "flanking_base" and convert from snoope to fastq
+# Note that require flanking length need to be screen by Bob snoope script first
+
+# Author: Arkarachai Fungtammasan
+# Version 1.0.0 (15 July 2012)
+# Input format: length_of_repeat[0] 	 left_flank_length[1]	right_flank_length[2]	repeat_motif[3]	hamming_distance[4]	read_name[5]	read_sequence[6]	read_quality[7]
+# Output format: two fastq file. First file contain left flank. Second file contain right flank.
+# Command: python pair_fetch_DNA_ff.py input.txt
+
+import sys
+from galaxy import eggs
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+# read file name
+
+
+	
+filename=sys.argv[1]
+L_filename=sys.argv[2]
+R_filename=sys.argv[3]
+quality_require=sys.argv[4]
+flanking_base=sys.argv[5]
+try:
+	quality_require=int(quality_require)
+	flanking_base=int(flanking_base)
+except Exception, eee:
+	print eee
+	stop_err("Quality score cutoff and Length of flanking regions that require quality screening must be integer")
+	
+fd=open(filename)
+fdd1=open(L_filename,'w')
+fdd2=open(R_filename,'w')
+lines=fd.xreadlines()
+for line in lines:
+    temp=line.strip().split('\t')
+    temp=filter(None,temp)
+    #get index
+    left_flank=(0,int(temp[1]))
+    microsat=(int(temp[1]),int(temp[1])+int(temp[0]))
+    right_flank=(int(temp[1])+int(temp[0]),int(temp[1])+int(temp[0])+int(temp[2]))
+    flag=0
+    #filter length of left and right flank
+    if (right_flank[1]-right_flank[0])<flanking_base:
+    	continue
+    if (left_flank[1]-left_flank[0])<flanking_base:
+    	continue
+    #filter quality score
+    for i in temp[7][microsat[0]-flanking_base:microsat[1]+flanking_base]:
+        if ord(i)<(quality_require+33):
+            flag=1
+        else:
+            flag=flag
+    #print out to seperated files
+    if flag ==0:
+        newname= temp[5]##+'_'+temp[3]+'_'+temp[0]
+        fdd1.writelines('@'+newname+'\n')
+        fdd2.writelines('@'+newname+'\n')
+        fdd1.writelines(temp[6][left_flank[0]:left_flank[1]]+'\n')
+        fdd2.writelines(temp[6][right_flank[0]:right_flank[1]]+'\n')
+        fdd1.writelines('+'+newname+'\n')
+        fdd2.writelines('+'+newname+'\n')
+        fdd1.writelines(temp[7][left_flank[0]:left_flank[1]]+'\n')
+        fdd2.writelines(temp[7][right_flank[0]:right_flank[1]]+'\n')
+
+fd.close()
+fdd1.close()
+fdd2.close()
+
+