Snakefile "ymp/rules/spades.rules"

Env(name="spades", base="bioconda", packages=["spades", "pigz"])

with Stage("assemble_metaspades") as S:
    S.doc("""Assemble reads using metaspades

    >>> ymp make toy.assemble_metaspades
    >>> ymp make toy.group_ALL.assemble_metaspades
    >>> ymp make toy.group_Subject.assemble_metaspades
    """)

    localrules: metaspades_input_yaml
    rule metaspades_input_yaml:
        """
        Prepares a dataset config for spades. Spades commandline is limited to
        at most 9 pairs of fq files, so to allow arbitrary numbers we need to
        use the dataset config option.

        Preparing in a separate rule so that the main metaspades rule can use
        the ``shell:`` rule and not ``run:``, which would preclude it from
        using conda environments.
        """
        message:
            "Preparing Config YAML for MetaSpades: {output}"
        input:
            r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
        output:
            yaml = "{:this:}/{target}.yaml"
        run:
            import yaml
            from ymp.util import filter_out_empty

            r1, r2 = filter_out_empty(input.r1, input.r2)

            with open(output.yaml, "w") as cfg:
                cfg.write(yaml.safe_dump([{
                    "left reads": ["../"+r for r in r1],
                    "right reads": ["../"+r for r in r2],
                    "type": "paired-end",
                    "orientation": "fr"
                }]))

    rule metaspades:
        """
        Runs MetaSpades. Supports reads.by_COLUMN.sp/complete as target for
        by group co-assembly.
        """
        message:
            "(Co-)Assembling {wildcards.target} with MetaSpades"
        input:
            conf    = "{:this:}/{target}.yaml",
            r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
        output:
            fasta   = "{:this:}/{target}.fasta.gz",
            fastg   = "{:this:}/{target}.fastg.gz"
        log:
            console = "{:this:}/{target}.log",
            spades  = "{:this:}/{target}.spades.log.gz"
        benchmark:
            "benchmarks/metaspades/{:this:}/{target}.txt"
        params:
            workdir = "{:this:}/tmp.{target}/",
            tmpdir  = "{:dir.tmp:}",
            mem     = icfg.mem("1000g")
        conda:
            "spades"
        threads:
            24
        shell: """
        CONTINUE=""
        if [ -e "{params.workdir}" ]; then
          if [ -e "{params.workdir}/params.txt" ]; then
            CONTINUE=y
          else
            rm -rf "{params.workdir}"
          fi
        fi

        if [ -n "$CONTINUE" ]; then
          metaspades.py \
            -o {params.workdir} \
            --threads {threads} \
            --tmp-dir {params.tmpdir} \
            --memory $(({params.mem}/1024)) \
            --restart-from last \
            >{log.console} 2>&1
        else
          metaspades.py \
            -o {params.workdir} \
            --threads {threads} \
            --tmp-dir {params.tmpdir} \
            --dataset {input.conf} \
            --memory $(({params.mem}/1024)) \
            >{log.console} 2>&1
        fi

        pigz -p {threads} -9 -c {params.workdir}/scaffolds.fasta > {output.fasta}
        pigz -p {threads} -9 -c {params.workdir}/assembly_graph.fastg > {output.fastg}
        cat {params.workdir}/{{params.txt,spades.log}} | pigz -p {threads} -9 > {log.spades}
        rm -rf {params.workdir}
        """

    rule metaspades_all:
        message:
            "Completed Metaspades assemblies"
        input:
            "{:this:}/{:targets:}.fasta.gz"
        output:
            "{:this:}/all_targets.stamp"
        shell:
            "touch {output}"