- Notifications
You must be signed in to change notification settings - Fork 406
/
Copy pathdiagnostic.py
108 lines (91 loc) · 5.19 KB
/
diagnostic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Use nextclade QC to produce a list of sequences to be excluded.
"""
importargparse
importnumpyasnp
importpandasaspd
fromdatetimeimportdatetime, timedelta
defisfloat(value):
try:
float(value)
returnTrue
exceptValueError:
returnFalse
defdatestr_to_ordinal(x, minus_weeks=0):
try:
return (datetime.strptime(x,"%Y-%m-%d") -timedelta(weeks=minus_weeks)).toordinal()
except:
returnnp.nan
defearliest_clade_date(Nextstrain_clade, clade_emergence_dates_filename, window_weeks=2):
clade_dates=pd.read_csv(clade_emergence_dates_filename, index_col="Nextstrain_clade", sep='\t')
try:
returndatestr_to_ordinal(clade_dates.loc[Nextstrain_clade]['first_sequence'], minus_weeks=window_weeks)
except:
returnnp.nan
if__name__=='__main__':
parser=argparse.ArgumentParser(
description="check sequences for anomalies",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--metadata", type=str, required=True, help="metadata")
parser.add_argument("--clade_emergence_dates", type=str, default="defaults/clade_emergence_dates.tsv", help="tsv file with two columns: Nextstrain_clade name and first known sequence for that clade.")
parser.add_argument("--clock-filter-recent", type=float, default=20, help="maximal allowed deviation from the molecular clock")
parser.add_argument("--clock-filter", type=float, default=15, help="maximal allowed deviation from the molecular clock")
parser.add_argument("--snp-clusters", type=int, default=1, help="maximal allowed SNP clusters (as defined by nextclade)")
parser.add_argument("--contamination", type=int, default=7, help="maximal allowed putative contamination (labeled + reversion) mutations as defined by nextclade")
parser.add_argument("--clade-emergence-window", type=int, default=2, help="number of weeks before official emergence of clade at which sequences can safely be excluded")
parser.add_argument("--skip-inputs", type=str, nargs="*", help="names of inputs to skip diagnostics for based on presence of metadata fields named like '{input}' with a value of 'yes'")
parser.add_argument("--output-exclusion-list", type=str, required=True, help="Output to-be-reviewed addition to exclude.txt")
args=parser.parse_args()
metadata=pd.read_csv(args.metadata, sep='\t')
# If any inputs should be skipped for diagnostics, remove their records from
# metadata prior to analysis.
ifargs.skip_inputs:
forinput_nameinargs.skip_inputs:
ifinput_nameinmetadata.columns:
metadata=metadata.loc[metadata[input_name] !="yes"].copy()
check_recency="date_submitted"inmetadata
ifcheck_recency:
recency_cutoff= (datetime.today() -timedelta(weeks=4)).toordinal()
recent_sequences=metadata.date_submitted.apply(lambdax: datestr_to_ordinal(x)>recency_cutoff)
else:
print("Skipping QC steps which rely on submission recency, as metadata is missing 'date_submitted'")
check_clade_dates="Nextstrain_clade"inmetadata
#auto exclude sequences N weeks before their clade emergence
ifcheck_clade_dates:
dates=metadata.date.apply(lambdax: datestr_to_ordinal(x))
clade_dates=metadata.Nextstrain_clade.apply(lambdax: earliest_clade_date(x, args.clade_emergence_dates, window_weeks=args.clade_emergence_window))
else:
print("Skipping QC steps which rely on clade-date combinations, as metadata is missing 'Nextstrain_clade'")
if"clock_deviation"inmetadata.columns:
clock_deviation=np.array([float(x) ifisfloat(x) elsenp.nanforxinmetadata.clock_deviation])
else:
clock_deviation=np.zeros(len(metadata), dtype=bool)
if"reversion_mutations"inmetadata.columns:
reversion_mutations=np.array([float(x) ifisfloat(x) elsenp.nanforxinmetadata.reversion_mutations])
else:
reversion_mutations=np.zeros(len(metadata), dtype=bool)
if"potential_contaminants"inmetadata.columns:
contaminants=np.array([float(x) ifisfloat(x) elsenp.nanforxinmetadata.potential_contaminants])
else:
contaminants=np.zeros(len(metadata), dtype=bool)
if"snp_clusters"inmetadata.columns:
snp_clusters=np.array([float(x) ifisfloat(x) elsenp.nanforxinmetadata.snp_clusters])
else:
snp_clusters=np.zeros(len(metadata), dtype=bool)
to_exclude=np.zeros_like(clock_deviation, dtype=bool)
to_exclude|= (reversion_mutations+contaminants>args.contamination)
ifcheck_recency:
to_exclude|=np.abs(clock_deviation)>args.clock_filter_recent
to_exclude|= (np.abs(clock_deviation)>args.clock_filter)&(~recent_sequences)
else:
to_exclude|=np.abs(clock_deviation)>args.clock_filter
ifcheck_clade_dates:
to_exclude|=dates<clade_dates
to_exclude|=snp_clusters>args.snp_clusters
if"QC_mixed_sites"inmetadata.columns:
to_exclude|=metadata.QC_mixed_sites=='bad'
# write out file with sequences flagged for exclusion
withopen(args.output_exclusion_list, 'w') asexcl:
forsinmetadata.loc[to_exclude,'strain']:
excl.write(f'{s}\n')