ncov/scripts/assign_clades.py at master · nextstrain/ncov · GitHub

Name: ncov/scripts/assign_clades.py at master · nextstrain/ncov · GitHub
Rating: 4.4 (8709 reviews)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
"""
Obsolete: script that assigns clades to sequences based on clade designations in `defaults/clades.tsv`
"""
importnumpyasnp
importargparse, sys, os
fromBioimportAlignIO, SeqIO, Seq, SeqRecord
fromBio.AlignIOimportMultipleSeqAlignment
fromaugur.translateimportsafe_translate
fromaugur.alignimportrunasaugur_align
fromaugur.cladesimportread_in_clade_definitions, is_node_in_clade
fromaugur.utilsimportload_features

classalignArgs:
def__init__(self, **kwargs):
self.__dict__.update(kwargs)

classtmpNode(object):
def__init__(self):
self.sequences= {}

if__name__=='__main__':
parser=argparse.ArgumentParser(
description="Assign clades to sequences",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
 )
group=parser.add_mutually_exclusive_group(required=True)
group.add_argument("--sequences", help="*unaligned* FASTA file of SARS-CoV-2sequences")
group.add_argument("--alignment", help="*aligned* FASTA file of SARS-CoV-2 sequences relative to Wuhan-HU-1 with insertions removed")
parser.add_argument("--output", type=str, default='clade_assignment.tsv', help="tsv file to write clade definitions to")
parser.add_argument("--keep-temporary-files", action='store_true', help="don't clean up")
parser.add_argument("--chunk-size", default=10, type=int, help="process this many sequences at once")
parser.add_argument("--nthreads", default=1, type=int, help="Number of threads to use in alignment")
args=parser.parse_args()

refname=f"defaults/reference_seq.gb"
features=load_features(refname)
ifargs.sequences:
seqs=SeqIO.parse(args.sequences, 'fasta')
else:
alignment=SeqIO.parse(args.alignment, 'fasta')

ref=SeqIO.read(refname, 'genbank')
clade_designations=read_in_clade_definitions(f"defaults/clades.tsv")

log_fname="clade_assignment.log"
in_fname="clade_assignment_tmp.fasta"
out_fname="clade_assignment_tmp_alignment.fasta"


output=open(args.output, 'w')
print('name\tclade\tparent clades', file=output)

# break the sequences into chunks, align each to the reference, and assign clades one-by-one
done=False
whilenotdone:
# if not aligned, align
ifargs.sequences:
# generate a chunk with chunk-size sequences
chunk= []
whilelen(chunk)<args.chunk_sizeand (notdone):
try:
seq=seqs.__next__()
chunk.append(seq)
exceptStopIteration:
done=True

print(f"writing {len(chunk)} and the reference sequence to file '{in_fname}' for alignment.")
withopen(in_fname, 'wt') asfh:
SeqIO.write(chunk, fh, 'fasta')

aln_args=alignArgs(sequences=[in_fname], output=out_fname, method='mafft',
reference_name=None, reference_sequence=refname,
nthreads=args.nthreads, remove_reference=False,
existing_alignment=False, debug=False, fill_gaps=False)

augur_align(aln_args)
alignment=AlignIO.read(out_fname, 'fasta')
else:
done=True

forseqinalignment:
ifseq.id==ref.id:
continue
iflen(seq.seq)!=len(ref.seq):
importipdb; ipdb.set_trace()
print(f"ERROR: this file doesn't seem aligned to the reference. {seq.id} as length {len(seq.seq)} while the reference has length {len(ref.seq)}.")
sys.exit(1)

# read sequence and all its annotated features
seq_container=tmpNode()
seq_str=str(seq.seq)
seq_container.sequences['nuc'] = {i:cfori,cinenumerate(seq_str)}
forfname, featinfeatures.items():
iffeat.type!='source':
seq_container.sequences[fname] = {i:cfori,cinenumerate(safe_translate(feat.extract(seq_str)))}

# for each clade, check whether it matches any of the clade definitions in the tsv
matches= []
forclade_name, clade_allelesinclade_designations.items():
ifis_node_in_clade(clade_alleles, seq_container, ref):
matches.append(clade_name)


# print the last match as clade assignment and all others as ancestral clades
# note that this assumes that clades in the tsv are ordered by order of appearence.
# furthermore, this will only work if parent clades don't have definitions that exclude
# child clades, i.e. positions can only be additive for this to work.
ifmatches:
print(f"{seq.description}\t{matches[-1]}\t{', '.join(matches[:-1])}", file=output)
else:
print(f"{seq.description}\t -- \t", file=output)

output.close()

ifnotargs.keep_temporary_files:
forfnamein [log_fname, in_fname, out_fname]:
ifos.path.isfile(fname): #won't exist if didn't align
os.remove(fname)