Snakefile "ymp/rules/spades.rules"

from ymp.util import check_input, filter_input

Env(name="spades", base="bioconda", packages=["spades", "pigz"])

with Stage("assemble_spades") as S:
    S.doc("""Assemble reads using spades

    >>> ymp make toy.assemble_spades
    >>> ymp make toy.group_ALL.assemble_spades
    >>> ymp make toy.group_Subject.assemble_spades
    >>> ymp make toy.assemble_spades
    >>> ymp make toy.assemble_spadesMeta
    >>> ymp make toy.assemble_spadesSc
    >>> ymp make toy.assemble_spadesRna
    >>> ymp make toy.assemble_spadesIsolate
    >>> ymp make toy.assemble_spadesNC
    >>> ymp make toy.assemble_spadesMetaNC
    """)
    S.add_param("", typ="choice", name="task",
                value=['Normal', 'Meta', 'Sc', 'Rna', 'Isolate',
                       'Plasmid', 'Metaviral', 'Metaplasmid', 'Rnaviral'],
                default="Normal")
    S.add_param("NC", typ="flag", name="not_careful")

    localrules: spades_input_yaml
    rule spades_input_yaml:
        """
        Prepares a dataset config for spades. Spades commandline is limited to
        at most 9 pairs of fq files, so to allow arbitrary numbers we need to
        use the dataset config option.

        Preparing in a separate rule so that the main spades rule can use
        the ``shell:`` rule and not ``run:``, which would preclude it from
        using conda environments.
        """
        message:
            "{:name:}: {output}"
        input:
            r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
        params:
            r1 = filter_input("r1", also=["r2"], minsize=1),
            r2 = filter_input("r2", also=["r1"], minsize=1)
        output:
            yaml = "{:this:}/{target}.yaml"
        shell: """
        if [ -z "{params.r1}" ]; then
          echo > {output.yaml}
          exit 0
        fi
        echo "- left reads:" >> {output.yaml}
        for n in {params.r1}; do
          echo "  - ../$n" >> {output.yaml}
        done
        echo "  right reads:" >> {output.yaml}
        for n in {params.r2}; do
          echo "  - ../$n" >> {output.yaml}
        done
        echo "  type: paired-end" >> {output.yaml}
        echo "  orientation: fr" >> {output.yaml}
        """

    rule spades:
        """
        Runs Spades. Supports reads.by_COLUMN.sp/complete as target for
        by group co-assembly.
        """
        message:
            "{:name:}: {output.scaffolds}"
        input:
            conf    = "{:this:}/{target}.yaml",
            r1      = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            r2      = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
        output:
            contigs    = "{:this:}/{target}.contigs.fasta.gz",
            scaffolds  = "{:this:}/{target}.fasta.gz",
            graph      = "{:this:}/{target}.fastg.gz"
        log:
            console     = "{:this:}/{target}.log",
            spades      = "{:this:}/{target}.spades.log.gz"
        benchmark:
            "benchmarks/spades/{:this:}/{target}.txt"
        params:
            workdir = "{:this:}/{target}.tmp/",
            tmpdir  = "{:dir.tmp:}",
            #low_read_count = 50,
            # Allow failure if we have <50 sequences
            sufficient_input = check_input(["r1", "r2"], minlines=50*4*2)
        resources:
            mem = "1T",
        conda:
            "spades"
        threads:
            128
        shell: r"""
        OUTPUT="scaffolds.fasta"
        case {params.task} in
          Normal)  TASK="" ;;
          Meta)    TASK="--meta" ;;
          Sc)      TASK="--sc" ;;
          Rna)     TASK="--rna"; OUTPUT=transcripts.fasta ;;
          Isolate) TASK="--isolate" ;;
          Plasmid) TASK="--plasmid" ;;
          Metaviral) TASK="--metaviral" ;;
          Metaplasmid) TASK="--metaplasmid" ;;
          Rnaviral) TASK="--rnaviral" ;;
        esac
        if [ -z "{params.not_careful}" ]; then
          case {params.task} in
            Meta|Rna|Isolate|MetaViral|Metaplasmid|Rnaviral) : ;;
            *) TASK="$TASK --careful" ;;
          esac
        fi
        CONTINUE=""
        if [ -e "{params.workdir}" ]; then
           if [ -e "{params.workdir}/params.txt" ]; then
              CONTINUE=y
           else
              rm -rf "{params.workdir}"
           fi
        fi
        make_empty_result() {{
           mkdir -p "{params.workdir}"
           touch "{params.workdir}/$OUTPUT"
           touch "{params.workdir}/contigs.fasta"
           touch "{params.workdir}/assembly_graph.fastg"
           touch "{params.workdir}/params.txt"
           touch "{params.workdir}/spades.log"
           echo "YMP: $*" >> "{log.console}"
        }}
        if [ -e "{input.conf}" -a \! -s "{input.conf}" ]; then
           make_empty_result "skipped, empty input"
        else
           ARGS=""
           ARGS="$ARGS -o {params.workdir}"
           ARGS="$ARGS --threads {threads}"
           ARGS="$ARGS --tmp-dir {params.tmpdir}"
           # 98% available minus 2GB
           ARGS="$ARGS --memory $(({resources.mem_gb} * 98 / 100 - 2))"
           if [ -n "$CONTINUE" ]; then
              ARGS="$ARGS --restart-from last"
           else
              ARGS="$TASK $ARGS --dataset {input.conf}"
           fi
           echo "YMP: Running spades.py $ARGS" >>{log.console}
           if ! spades.py $ARGS >>{log.console} 2>&1; then
              if [ "{params.sufficient_input}" = "False" ]; then
                 make_empty_result "Assuming failure due to low read count; skipping"
              elif [ ! -e {params.workdir}spades.log ]; then
                 exit 1  # no log file?
              elif grep -q "index_.size() != 0' failed." {params.workdir}spades.log; then
                 make_empty_result "Spades index assertion failure. Usually due to bad input. Skipping"
              else
                 exit 1
              fi
           fi
        fi
        if [ ! -e {params.workdir}$OUTPUT ]; then
          if [ -e {params.workdir}contigs.fasta ]; then
            echo "YMP: WARNING Spades produced no $OUTPUT file, using contigs.fasta instead!" >> {log.console}
            OUTPUT=contigs.fasta
          fi
        fi

        pigz -p {threads} -9 -c {params.workdir}$OUTPUT > {output.scaffolds}
        pigz -p {threads} -9 -c {params.workdir}assembly_graph.fastg > {output.graph}
        if [ -e {params.workdir}/contigs.fasta ]; then
          pigz -p {threads} -9 -c {params.workdir}contigs.fasta > {output.contigs}
        else
          cp {output.scaffolds} {output.contigs}
        fi
        cat {params.workdir}{{params.txt,spades.log}} | pigz -p {threads} -9 > {log.spades}
        rm -rf {params.workdir}
        """