Snakefile "ymp/rules/megahit.rules"
Env(name="megahit", base="bioconda", packages=[
"megahit",
"pigz",
"coreutils", # for tac
"sed"
])
with Stage("assemble_megahit") as S:
S.doc("""
Assemble metagenome using MegaHit.
>>> ymp make toy.assemble_megahit.map_bbmap
>>> ymp make toy.group_ALL.assemble_megahit.map_bbmap
>>> ymp make toy.group_Subject.assemble_megahit.map_bbmap
""")
rule megahit:
"""
Runs MegaHit.
"""
message:
"(Co-)Assembling {wildcards.target} with megahit"
input:
r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz",
tmpdir = ancient(icfg.dir.tmp)
output:
fasta = "{:this:}/{target}.fasta.gz",
fastg = "{:this:}/{target}.fastg.gz"
log:
log = "{:this:}/{target}.log.gz",
out = "{:this:}/{target}.out.gz",
params:
workdir = "{:this:}/{target}",
preset = "meta-sensitive",
r1 = lambda wc, input: ",".join([input.r1]
if isinstance(input.r1, str) else input.r1),
r2 = lambda wc, input: ",".join([input.r2]
if isinstance(input.r2, str) else input.r2),
mem = icfg.mem("500g")
threads:
32
conda:
"megahit"
shell: r"""
# iff there is an existing options.json with an existing megahit temp dir,
# then we can continue an existing run
MHTMP=$(sed -n '/MEGAHIT_TEMP_DIR:/ s/.*:\s*//p' "{params.workdir}/options.json" 2>/dev/null||true)
CONTINUE=""
if [ -e "$MHTMP" ]; then
echo "YMP: Trying to restart aborted assembly"
CONTINUE="--continue"
else
rm -rf "{params.workdir}"
fi
megahit \
-1 {params.r1} -2 {params.r2} \
--presets {params.preset} \
--num-cpu-threads {threads} \
--out-dir {params.workdir} \
--tmp-dir {input.tmpdir} \
$CONTINUE 2> {log.out}
# abort if not finished
[ -e {params.workdir}/done ] || exit 1
# output zipped contigs
pigz -p {threads} -9 -c {params.workdir}/final.contigs.fa > {output.fasta}
# output the zipped log
cat {params.workdir}/{{options.json,log}} |\
pigz -p {threads} -9 -c > {log.log}
# determine largest K used
MAXK=$(ls {params.workdir}/intermediate_contigs -1 |\
sed -n 's%^k\([0-9]*\).*%\1%p' | sort -nr | head -n1)
# output the zipped fastg
megahit_toolkit contig2fastg $MAXK \
{params.workdir}/intermediate_contigs/k${{MAXK}}.contigs.fa |\
pigz -p {threads} -9 -c > {output.fastg}
# remove intermediate contigs
rm -rf {params.workdir}
"""
rule megahit_all:
message:
"Completed Megahit assemblies"
input:
"{:this:}/{:targets:}.fasta.gz"
output:
"{:this:}/all_targets.stamp"
shell:
"touch {output}"