Mercurial > repos > nml > biohansel_bionumeric_converter
annotate bionumeric_converter.py @ 0:b000a3130db8 draft
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
author | nml |
---|---|
date | Mon, 18 Mar 2019 13:15:57 -0400 |
parents | |
children | 07dfb8fd47f4 |
rev | line source |
---|---|
0
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
1 #!/usr/bin/env python |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
2 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
3 # Import dependencies needed |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
4 import argparse |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
5 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
6 import pandas as pd |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
7 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
8 # Define the main function: |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
9 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
10 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
def main():
    """Command-line entry point.

    Reads the tab-separated file named by ``-f/--filename``, swaps commas
    for ``/`` via ``comma_remover``, shortens long QC messages via
    ``qc_shortener`` and writes the result as CSV (without the index
    column) to the path given by ``-o/--output`` (``output.csv`` when the
    option is omitted).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-f', '--filename', required=True, help='Specify your tsv input')
    cli.add_argument(
        '-o', '--output', default='output.csv', help='Specify output name')
    options = cli.parse_args()

    # Pipeline: load + strip commas -> shorten QC text -> write CSV.
    cleaned = comma_remover(options.filename)
    shortened = qc_shortener(cleaned)
    shortened.to_csv(options.output, index=False)
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
30 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
31 # Remove comma function: |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
32 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
33 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
def comma_remover(tsv_file):
    """Load a tab-separated file and replace commas in its values with '/'.

    Args:
        tsv_file: Path or file-like object handed straight to
            ``pandas.read_csv`` with a tab separator.

    Returns:
        A new DataFrame in which every ',' occurring in a value has been
        substituted by '/' (regex-based replacement over the whole table).
    """
    table = pd.read_csv(tsv_file, sep='\t')
    # regex=True makes this a substring substitution rather than a
    # whole-cell match, so commas embedded inside longer text are replaced.
    return table.replace(to_replace=',', value='/', regex=True)
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
40 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
41 # Shorten QC results: |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
42 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
43 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
def qc_shortener(df, max_length=150):
    """Shorten over-long entries of the ``qc_message`` column.

    A message longer than ``max_length`` characters is cut at its first
    '|' and prefixed with ``"Truncated after first '|' : "``.  Messages
    that are short enough, are not strings (e.g. missing values), or
    contain no '|' at all are left exactly as they are.

    Args:
        df: DataFrame containing a ``qc_message`` column.
        max_length: Length above which a message is shortened
            (defaults to 150, the previously hard-coded threshold).

    Returns:
        The same DataFrame object, with its ``qc_message`` column updated.

    Raises:
        KeyError: If ``df`` has no ``qc_message`` column.
    """
    def _shorten(message):
        # Only real text can be too long; NaN/None/numbers pass through.
        if not isinstance(message, str) or len(message) <= max_length:
            return message
        head, separator, _ = message.partition('|')
        if not separator:
            # No '|' to cut at.  (str.find would return -1 here and a
            # slice of [0:-1] would silently drop the last character.)
            return message
        return "Truncated after first '|' : " + head

    # One pass over the column: each cell is rewritten exactly once, so
    # duplicate messages are never re-processed after being shortened.
    df['qc_message'] = df['qc_message'].map(_shorten)
    return df
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
52 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
53 |
b000a3130db8
planemo upload commit e5e384ce6c90f595e8d397a7c45ca9c17d4a3e2a
nml
parents:
diff
changeset
|
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()