diff cut_degen.py @ 0:716cdcccc919 draft default tip

planemo upload for repository https://github.com/mshortr/degenerateprimerremoval_fastq commit 7d51a7e3ccb0164b530bf1664068d86241f2f2f5-dirty
author megan-shortridge
date Tue, 15 Sep 2015 14:14:41 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cut_degen.py	Tue Sep 15 14:14:41 2015 -0400
@@ -0,0 +1,95 @@
+#This is a version that is meant to work with Fastq files
+import Bio
+from Bio import SeqUtils
+from Bio import SeqIO
+import sys
+
+
+#for records and adapter, should be sys.argv[1 and 2]
+fastqfile = sys.argv[1] #This is the input fasta file
+adapter = sys.argv[2] #This is the input adapter as a string
+keepreads = sys.argv[3] #True or false, this will determine whether or not reads are kept. If true, it will keep reads that do not have the adapter in it. If false, it will get rid of those reads.
+removeadapters = sys.argv[4] #True or false, if this is True, the adapters will be removed. If true, removes the adapters from the sequences. If false, it keeps them.
+end_defn = sys.argv[5] #If 5, the primer is removed from the 5' end of the sequence. If 3, then it is removed from the 3' end of the sequence.
+adapter_name = sys.argv[6] #This is the name of the adapter that you can put into the output text file.
+
+
+# Here is the command for the test:  python cut_degen.py 'test.fastq' 'GAACWAYWYCT' 'True' 'True'  '5' 'test'
+
+keepreads = str(keepreads)
+removeadapters = str(removeadapters)
+fastqfile=str(fastqfile)
+end_defn = str(end_defn)
+
+fh = open(fastqfile, mode='r+')
+len_adapter = len(adapter)
+count_adapter_found = 0
+count_adapter_not_found = 0
+total_seq_count = 0
+
+parsed = SeqIO.parse(fh, format="fastq")
+
+output_fh_name = "output.fastq"
+
+if fastqfile=="test3prime.fastq":
+    output_fh_name="output2.fastq"
+
+output_fh = open(output_fh_name, mode='w+')
+
+output_text_name = "output.txt"
+if fastqfile=="test3prime.fastq":
+    output_text_name="output2.txt"
+output_text_fh = open(output_text_name, mode='w+')
+
+
+for record in parsed:
+    try:
+        sequence = str(record.seq)
+        search = SeqUtils.nt_search(sequence, adapter) #This will search the
+        index = int(search[1]) #If it finds the adapter, is the starting index from which it was found.
+        adapter_start = index
+        adapter_end = index+len_adapter
+        count_adapter_found +=1
+        total_seq_count+=1
+        if removeadapters == "True": #if the value is true, it removes the adapters from the sequences.
+            if end_defn=="5":
+                record = record[adapter_end:] #If a 5' adapter, you remove adapter from beginning
+            elif end_defn=="3":
+                record = record[:adapter_start] #If it is a 3' adapter, you remove the adapter at the end
+        elif removeadapters == "False": #if the value is false, it does not remove the adapters from the sequences.
+            record = record
+        SeqIO.write(record, output_fh, format="fastq") #No matter what, write the reads.
+    except IndexError:
+        count_adapter_not_found+=1
+        total_seq_count+=1
+        record = record
+        if keepreads=="True":
+            SeqIO.write(record, output_fh, format="fastq")
+        elif keepreads=="False":
+            pass
+        else:
+            pass
+
+output_fh.close()
+
+percent_cut = 100*(float(count_adapter_found)/float(total_seq_count))
+
+
+output_text_fh.write("The total number of sequences that were analyzed was %i.\n\n"%total_seq_count)
+output_text_fh.write("Adapter was found and removed for %i sequences (%i%% of total).\n\n"%(count_adapter_found, percent_cut))
+
+if keepreads =="True":
+    output_text_fh.write("Sequences that did not contain the adapter were kept.\n\n")
+elif keepreads=="False":
+    output_text_fh.write("Sequences that did not contain the adapter were removed from the dataset.\n\n")
+if removeadapters=="True":
+    output_text_fh.write("The adapters were removed from the dataset.\n\n")
+elif removeadapters=="False":
+    output_text_fh.write("The adapters were not removed from the dataset.\n\n")
+if end_defn=="5":
+    output_text_fh.write("Adapters were removed from the 5\' end.\n\n")
+elif end_defn=="3":
+    output_text_fh.write("Adapters were removed from the 3\'end.\n\n")
+
+output_text_fh.write("The name of the adapter that was removed was named %s, and had the sequence %s.\n\n"%(adapter_name,adapter))
+output_text_fh.close()
\ No newline at end of file