Snakefile "ymp/rules/cdhit.rules"
Env(name="cdhit", base="bioconda", packages="cd-hit")
with Stage("cluster_cdhit") as S:
S.doc("""
Clusters protein sequences using CD-HIT
>>> ymp make toy.ref_query.cluster_cdhit
""")
rule cdhit_prepare_input:
"""Prepares input data for CD-HIT
- rewrites '*' to 'X' as stop-codon not understood by CD-HIT
- prefixes lost ID to Fasta ID
"""
message:
"CDHit: preparing input data - {wildcards.target}"
input:
"{:prev:}/{:target:}.fastp.gz"
output:
temp("{:this:}/combined_{target}.fastp")
run:
import gzip
with open(output[0], "w") as outfasta:
for infile in input:
id = os.path.basename(infile)[:-len(".fastp.gz")]
with gzip.open(infile, "rt") as infasta:
for line in infasta:
if line.startswith(">"):
outfasta.write("".join((">",id,":",line[1:])))
else:
outfasta.write(line.replace("*","X"))
rule cdhit_faa_single:
"""Clustering predicted genes using cdhit"""
message:
"CD-HIT: clustering - {wildcards.target}"
input:
"{:this:}/combined_{target}.fastp"
output:
fa = "{:this:}/{target}.fastp.gz",
clstr = "{:this:}/{target}.clstr"
log:
"{:this:}/{target}.log"
threads:
32
params:
slow=1,
print_overlap=1,
description_length=0
resources:
mem = "32g",
conda:
"cdhit"
shell: """
cd-hit \
-T {threads} \
-M $[1024 * 4 * {threads}] \
-i {input} \
-o {output.fa} \
-g {params.slow} \
-p {params.print_overlap} \
-d {params.description_length} \
> {log} 2>&1
mv {output.fa}.clstr {output.clstr}
"""
rule cdhit_clstr_to_csv:
message:
"CD-HIT: converting clustering output to csv - {wildcards.target}"
input:
"{:this:}/{target}.clstr"
output:
"{:this:}/{target}.clstr.csv"
threads:
1
run:
import re, csv
clstr_format = re.compile(
r"(?P<leaf_id>\d+)\s+(?P<qlen>\d+)aa,\s"
r">(?P<qacc>.*)\.\.\.\s"
r"(at\s(?P<qstart>\d+):(?P<qend>\d+):(?P<sstart>\d+):(?P<send>\d+)/"
r"(?P<pident>\d+\.\d+)%|\*)"
, flags = re.VERBOSE|re.ASCII)
fieldnames = ["cluster_id", "leaf_id",
"sacc", "qacc", "qlen",
"qstart", "qend", "sstart", "send",
"pident"]
with open(input[0], "r") as inf, \
open(output[0], "w") as outf:
writer = csv.DictWriter(outf, fieldnames=fieldnames)
writer.writeheader()
rows=[]
cluster_id = None
sacc = None
for line in inf:
if line[0] == ">":
if len(rows) > 0:
for row in rows:
row['sacc'] = sacc
writer.writerows(rows)
rows=[]
cluster_id = line.split()[1].strip()
else:
d = clstr_format.match(line).groupdict()
d["cluster_id"] = cluster_id
if "qstart" not in d or d["qstart"] is None:
d["qstart"] = 1
d["qend"] = d["qlen"]
d["sstart"] = 1
d["send"] = d["qlen"]
d["pident"] = "100.00"
sacc = d["qacc"]
rows.append(d)
if len(rows) > 0:
for row in rows:
row['sacc'] = sacc
writer.writerows(rows)
rule cdhit_fna_single:
"""Clustering predicted genes (nuc) using cdhit-est"""
message:
"CD-HIT-EST clustering {input} -> {output}"
input:
"{dir}.genes/{nodots}.fna"
output:
"{dir}.genes/{nodots}.NR.fna"
log:
"{dir}.genes/{nodots}.NR.fna.log"
threads:
33
params:
slow=1,
print_overlap=1,
description_length=0,
id=0.95
conda:
"cdhit"
shell: """
cd-hit-est \
-i {input} \
-o {output} \
-c {params.id} \
-M $[{threads} * 1024 * 4] \
-T {threads} \
-g {params.slow} \
-p {params.print_overlap} \
-d {params.description_length} \
> {log} 2>&1
"""