annotate NGSrich_0.5.5/src/filters/ReadFilter.java @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
1 package filters;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
2
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
3 import java.io.File;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
4 import java.io.FileNotFoundException;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
5 import java.io.FileWriter;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
6 import java.io.IOException;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
7 import java.util.Scanner;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
8 import datastructures.ReadLine;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
9
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
10 public class ReadFilter extends Filter{
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
11
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
12 File input, output;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
13
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
14 /**
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
15 * Constructs a SamAdapter object. The output of an adaption is written to the
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
16 * given file outputFileName.
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
17 *
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
18 * @param inputFileName the name of the read alignment input file.
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
19 * @param outputFileName the name of the output file containing the reduced
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
20 * format of the read alignment file. It must ends with ".red".
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
21 */
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
22 public ReadFilter(String inputFileName, String outputFileName) {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
23 super(inputFileName, outputFileName);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
24 input = new File(inputFileName);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
25 output = new File(outputFileName);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
26 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
27
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
28 /**
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
29 * <P>
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
30 * Uses ReadLine zu reduce each line of the read alignment file to following format:<BR>
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
31 * &#60;name> &#60;chrom> &#60;start> &#60;end> (tab delimited).
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
32 * </P>
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
33 * In the following we list the 12 fields of the sam-alignment-file. We mark the fields we are
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
34 * interessted in with (!!):
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
35 * <PRE>
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
36 * 1. &#60;QNAME> : Query pair NAME if paired; or Query NAME if unpaired (Ex: 6:105:18438:14421) (!!)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
37 * 2. &#60;FLAG> : bitwise FLAG a₀a₁a₂a₃a₄a₅a₆a₇a₈a₉a₁₀ (Ex: 0 forward, 16 reverse strand)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
38 * a₀ : the read is paired in sequencing, (no matter whether it is mapped in a pair)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
39 * a₁ : the read is mapped in a proper pair
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
40 * a₂ : the query sequence itself is unmapped
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
41 * a₃ : the mate is unmapped
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
42 * a₄ : strand of the query (0 for forward; 1 for reverse strand)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
43 * a₅ : strand of the mate
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
44 * a₆ : the read is the first read in a pair
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
45 * a₇ : the read is the second read in a pair
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
46 * a₈ : the alignment is not primary
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
47 * a₉ : the read fails platform/vendor quality checks
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
48 * a₁₀: the read is either a PCR duplicate or an optical duplicate
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
49 * 3. &#60;RNAME> : Reference sequence NAME (Ex: chr10) (!!)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
50 * 4. &#60;POS> : 1-based leftmost POSition/coordinate of the clipped sequence (Ex: 60041) (!!)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
51 * 5. &#60;MAPQ> : MAPping Quality (Ex: 0)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
52 * (phred-scaled posterior probability that the mapping position of this read is incorrect)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
53 * 6. &#60;CIGAR> : extended CIGAR string (Ex: 150M)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
54 * 7. &#60;MRNM> : Mate Reference sequence NaMe; “=” if the same as &#60;RNAME> (Ex:*)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
55 * 8. &#60;MPOS> : 1-based leftmost Mate POSition of the clipped sequence (Ex: 0)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
56 * 9. &#60;ISIZE> : inferred Insert SIZE (Ex: 0)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
57 * 10. &#60;SEQ> : query SEQuence; “=” for a match to the reference; n/N/. for ambiguity; cases are not maintained (!!)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
58 * (Ex: TGTTGTTGTTATTTCTGAATGACATTTACTTTGCTGCTCTTTATTTTGCG
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
59 * TATTTAAAACTATTAGATCGTGTGATTATATTTGACAGGTCTTAATTGAC
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
60 * GCGCTGTTCAGCCCTTTGAGTTCGGTTGAGTTTTGTGTTGGAGAATTTTC)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
61 * 11. &#60;QUAL> : query QUALity; ASCII-33 gives the Phred base quality
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
62 * (Ex: /.8349-7:95@=8999;1:=;===AABD:=@A;>AD:E:9@==69<;@B3CBC@B8B;B89=8=3;@@@.:->>B?
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
63 * C4CBB8EDGDD8GDEEDEEE8EBA9B???=B;,8:+5;;A??>?#############################)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
64 * 12. [&#60;TAG>:&#60;VTYPE>:&#60;VALUE> [...]]: TAG/Value TYPE/match <VTYPE> (space allowed)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
65 * (Ex: XT:A:R NM:i:2 X0:i:2 X1:i:0)
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
66 * </PRE>
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
67 */
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
68 public void filter() {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
69 FileWriter fw = null;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
70 Scanner s = null;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
71
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
72 try {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
73 s= new Scanner(input);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
74 } catch (FileNotFoundException e) {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
75 System.err.println("sam file not found");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
76 e.printStackTrace();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
77 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
78
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
79 try {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
80 if(output == null){
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
81 output = new File(input.getName().
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
82 substring(0,input.getName().lastIndexOf("."))+".rsam");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
83 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
84
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
85 fw = new FileWriter(output);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
86
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
87 } catch (IOException e) {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
88 System.err.println("Error generating rsam file");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
89 e.printStackTrace();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
90 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
91
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
92 String rawline;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
93 ReadLine line = null;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
94
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
95 do{
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
96 rawline = s.nextLine();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
97 }while(rawline.startsWith("@"));
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
98
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
99 do{
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
100 try {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
101 line = new ReadLine(rawline);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
102 fw.write(line+"\r\n");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
103 } catch (IOException e) {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
104 System.err.println("Error writing reduced form of:\n"+rawline);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
105 e.printStackTrace();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
106 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
107 if(s.hasNextLine())
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
108 rawline = s.nextLine();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
109 }while(s.hasNextLine());
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
110
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
111
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
112 try {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
113 fw.write(line +"\r\n");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
114 } catch (IOException e) {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
115 System.err.println("Error writing reduced form of:\n"+line);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
116 e.printStackTrace();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
117 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
118
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
119 try {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
120 fw.close();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
121 } catch (IOException e) {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
122 System.err.println("Error closing file");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
123 e.printStackTrace();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
124 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
125 s.close();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
126
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
127 System.out.println("READS FILE:");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
128 System.out.println(input.getAbsolutePath()+" reduced to "+
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
129 output.getAbsolutePath());
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
130 sort();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
131 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
132
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
133
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
134 public void sort() {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
135 Runtime rt = Runtime.getRuntime();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
136 try {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
137 String rawOutput = output.getAbsolutePath();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
138 String outputName = output.getName();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
139 String pathname = output.getParentFile().getAbsolutePath()+"/"+outputName+"Sorted";
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
140
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
141 output = new File(pathname);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
142 String tmpD=output.getParentFile().getAbsolutePath();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
143
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
144 if(!output.exists())output.createNewFile();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
145 String command = "sort -k2,2 -k3n,3 -T "+tmpD+" "+rawOutput;
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
146 Process p = rt.exec(command);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
147 Scanner ps = new Scanner(p.getInputStream());
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
148
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
149 FileWriter fw = new FileWriter(output);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
150 while(ps.hasNextLine()){
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
151 String nextLine = ps.nextLine();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
152 fw.write(nextLine+"\n");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
153 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
154 fw.close();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
155
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
156 Scanner psStdErr=new Scanner(p.getErrorStream());
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
157 while(psStdErr.hasNextLine()){
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
158 String errLine=psStdErr.nextLine();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
159 System.out.println(errLine);
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
160 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
161
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
162 new File(rawOutput).delete();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
163 new File(pathname).renameTo(new File(rawOutput));
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
164 System.out.println("Reduced file "+new File(rawOutput).getAbsolutePath()+" sorted\n");
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
165
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
166 } catch (IOException e1) {
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
167 e1.printStackTrace();
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
168 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
169 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
170
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
171
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
172 public String toString(){
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
173 return "ReadFilter";
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
174 }
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
175
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
176
89ad0a9cca52 Uploaded
pfrommolt
parents:
diff changeset
177 }