Snakefile "ymp/rules/spades.rules"
from ymp.util import check_input, filter_input
Env(name="spades", base="bioconda", packages=["spades", "pigz"])
with Stage("assemble_spades") as S:
S.doc("""Assemble reads using spades
>>> ymp make toy.assemble_spades
>>> ymp make toy.group_ALL.assemble_spades
>>> ymp make toy.group_Subject.assemble_spades
>>> ymp make toy.assemble_spades
>>> ymp make toy.assemble_spadesMeta
>>> ymp make toy.assemble_spadesSc
>>> ymp make toy.assemble_spadesRna
>>> ymp make toy.assemble_spadesIsolate
>>> ymp make toy.assemble_spadesNC
>>> ymp make toy.assemble_spadesMetaNC
""")
S.add_param("", typ="choice", name="task",
value=['Normal', 'Meta', 'Sc', 'Rna', 'Isolate',
'Plasmid', 'Metaviral', 'Metaplasmid', 'Rnaviral'],
default="Normal")
S.add_param("NC", typ="flag", name="not_careful")
localrules: spades_input_yaml
rule spades_input_yaml:
"""
Prepares a dataset config for spades. Spades commandline is limited to
at most 9 pairs of fq files, so to allow arbitrary numbers we need to
use the dataset config option.
Preparing in a separate rule so that the main spades rule can use
the ``shell:`` rule and not ``run:``, which would preclude it from
using conda environments.
"""
message:
"{:name:}: {output}"
input:
r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
params:
r1 = filter_input("r1", also=["r2"], minsize=1),
r2 = filter_input("r2", also=["r1"], minsize=1)
output:
yaml = "{:this:}/{target}.yaml"
shell: """
if [ -z "{params.r1}" ]; then
echo > {output.yaml}
exit 0
fi
echo "- left reads:" >> {output.yaml}
for n in {params.r1}; do
echo " - ../$n" >> {output.yaml}
done
echo " right reads:" >> {output.yaml}
for n in {params.r2}; do
echo " - ../$n" >> {output.yaml}
done
echo " type: paired-end" >> {output.yaml}
echo " orientation: fr" >> {output.yaml}
"""
rule spades:
"""
Runs Spades. Supports reads.by_COLUMN.sp/complete as target for
by group co-assembly.
"""
message:
"{:name:}: {output.scaffolds}"
input:
conf = "{:this:}/{target}.yaml",
r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
output:
contigs = "{:this:}/{target}.contigs.fasta.gz",
scaffolds = "{:this:}/{target}.fasta.gz",
graph = "{:this:}/{target}.fastg.gz"
log:
console = "{:this:}/{target}.log",
spades = "{:this:}/{target}.spades.log.gz"
benchmark:
"benchmarks/spades/{:this:}/{target}.txt"
params:
workdir = "{:this:}/{target}.tmp/",
tmpdir = "{:dir.tmp:}",
#low_read_count = 50,
# Allow failure if we have <50 sequences
sufficient_input = check_input(["r1", "r2"], minlines=50*4*2)
resources:
mem = "1T",
conda:
"spades"
threads:
128
shell: r"""
OUTPUT="scaffolds.fasta"
case {params.task} in
Normal) TASK="" ;;
Meta) TASK="--meta" ;;
Sc) TASK="--sc" ;;
Rna) TASK="--rna"; OUTPUT=transcripts.fasta ;;
Isolate) TASK="--isolate" ;;
Plasmid) TASK="--plasmid" ;;
Metaviral) TASK="--metaviral" ;;
Metaplasmid) TASK="--metaplasmid" ;;
Rnaviral) TASK="--rnaviral" ;;
esac
if [ -z "{params.not_careful}" ]; then
case {params.task} in
Meta|Rna|Isolate|MetaViral|Metaplasmid|Rnaviral) : ;;
*) TASK="$TASK --careful" ;;
esac
fi
CONTINUE=""
if [ -e "{params.workdir}" ]; then
if [ -e "{params.workdir}/params.txt" ]; then
CONTINUE=y
else
rm -rf "{params.workdir}"
fi
fi
make_empty_result() {{
mkdir -p "{params.workdir}"
touch "{params.workdir}/$OUTPUT"
touch "{params.workdir}/contigs.fasta"
touch "{params.workdir}/assembly_graph.fastg"
touch "{params.workdir}/params.txt"
touch "{params.workdir}/spades.log"
echo "YMP: $*" >> "{log.console}"
}}
if [ -e "{input.conf}" -a \! -s "{input.conf}" ]; then
make_empty_result "skipped, empty input"
else
ARGS=""
ARGS="$ARGS -o {params.workdir}"
ARGS="$ARGS --threads {threads}"
ARGS="$ARGS --tmp-dir {params.tmpdir}"
# 98% available minus 2GB
ARGS="$ARGS --memory $(({resources.mem_gb} * 98 / 100 - 2))"
if [ -n "$CONTINUE" ]; then
ARGS="$ARGS --restart-from last"
else
ARGS="$TASK $ARGS --dataset {input.conf}"
fi
echo "YMP: Running spades.py $ARGS" >>{log.console}
if ! spades.py $ARGS >>{log.console} 2>&1; then
if [ "{params.sufficient_input}" = "False" ]; then
make_empty_result "Assuming failure due to low read count; skipping"
elif [ ! -e {params.workdir}spades.log ]; then
exit 1 # no log file?
elif grep -q "index_.size() != 0' failed." {params.workdir}spades.log; then
make_empty_result "Spades index assertion failure. Usually due to bad input. Skipping"
else
exit 1
fi
fi
fi
if [ ! -e {params.workdir}$OUTPUT ]; then
if [ -e {params.workdir}contigs.fasta ]; then
echo "YMP: WARNING Spades produced no $OUTPUT file, using contigs.fasta instead!" >> {log.console}
OUTPUT=contigs.fasta
fi
fi
pigz -p {threads} -9 -c {params.workdir}$OUTPUT > {output.scaffolds}
pigz -p {threads} -9 -c {params.workdir}assembly_graph.fastg > {output.graph}
if [ -e {params.workdir}/contigs.fasta ]; then
pigz -p {threads} -9 -c {params.workdir}contigs.fasta > {output.contigs}
else
cp {output.scaffolds} {output.contigs}
fi
cat {params.workdir}{{params.txt,spades.log}} | pigz -p {threads} -9 > {log.spades}
rm -rf {params.workdir}
"""