Snakefile "ymp/rules/megahit.rules"

Env(name="megahit", base="bioconda", packages=[
    "megahit",
    "pigz",
    "coreutils",  # for tac
    "sed"
])

with Stage("assemble_megahit") as S:
    S.doc("""
    Assemble metagenome using MegaHit.

    >>> ymp make toy.assemble_megahit.map_bbmap
    >>> ymp make toy.group_ALL.assemble_megahit.map_bbmap
    >>> ymp make toy.group_Subject.assemble_megahit.map_bbmap
    """)
    rule megahit:
        """
        Runs MegaHit. 
        """
        message:
            "(Co-)Assembling {wildcards.target} with megahit"
        input:
            r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz",
            tmpdir = ancient(icfg.dir.tmp)
        output:
            fasta = "{:this:}/{target}.fasta.gz",
            fastg = "{:this:}/{target}.fastg.gz"
        log:
            log = "{:this:}/{target}.log.gz",
            out = "{:this:}/{target}.out.gz",
        params:
            workdir = "{:this:}/{target}",
            preset  = "meta-sensitive",
            r1 = lambda wc, input: ",".join([input.r1]
                                            if isinstance(input.r1, str) else input.r1),
            r2 = lambda wc, input: ",".join([input.r2]
                                            if isinstance(input.r2, str) else input.r2),
            mem = icfg.mem("500g")
        threads:
            32
        conda:
            "megahit"
        shell: r"""
        # iff there is an existing options.json with an existing megahit temp dir,
        # then we can continue an existing run
        MHTMP=$(sed -n '/MEGAHIT_TEMP_DIR:/ s/.*:\s*//p' "{params.workdir}/options.json" 2>/dev/null||true)
        CONTINUE=""
        if [ -e "$MHTMP" ]; then
            echo "YMP: Trying to restart aborted assembly"
            CONTINUE="--continue"
        else
            rm -rf "{params.workdir}"
        fi

        megahit \
          -1 {params.r1} -2 {params.r2} \
          --presets {params.preset} \
          --num-cpu-threads {threads} \
          --out-dir {params.workdir} \
          --tmp-dir {input.tmpdir} \
          $CONTINUE  2> {log.out}

        # abort if not finished
        [ -e {params.workdir}/done ] || exit 1

        # output zipped contigs
        pigz -p {threads} -9 -c {params.workdir}/final.contigs.fa > {output.fasta}

        # output the zipped log
        cat {params.workdir}/{{options.json,log}} |\
        pigz -p {threads} -9 -c > {log.log}

        # determine largest K used
        MAXK=$(ls {params.workdir}/intermediate_contigs -1 |\
               sed -n 's%^k\([0-9]*\).*%\1%p' | sort -nr | head -n1)

        # output the zipped fastg
        megahit_toolkit contig2fastg $MAXK \
        {params.workdir}/intermediate_contigs/k${{MAXK}}.contigs.fa |\
        pigz -p {threads} -9 -c > {output.fastg}

        # remove intermediate contigs
        rm -rf {params.workdir}
        """

    rule megahit_all:
        message:
            "Completed Megahit assemblies"
        input:
            "{:this:}/{:targets:}.fasta.gz"
        output:
            "{:this:}/all_targets.stamp"
        shell:
            "touch {output}"