Snakefile "ymp/rules/metabat2.rules"

Env(name="metabat2", base="bioconda", packages=[
    "metabat2",
    "pigz"
])

with Stage("bin_metabat2") as S:
    S.doc("""
    Bin metagenome assembly into MAGs

    >>> ymp make mock.assemble_megahit.map_bbmap.sort_bam.bin_metabat2
    >>> ymp make mock.group_ALL.assemble_megahit.map_bbmap.sort_bam.group_ALL.bin_metabat2
    """)

    rule metabat2_depth:
        """
        Generates a depth file from BAM
        """
        message:
            "Summarizing contig depth"
        input:
            bam    = "{:prev:}/{:target:}.sorted.bam"
        output:
            depth  = "{:this:}/{target}.depth.txt",
            paired = "{:this:}/{target}.paired.txt"
        log:
                     "{:this:}/{target}.depth.log"
        threads:
            1
        params:
            min_contig_length = 1000,
            min_contig_depth = 1
        conda:
            "metabat2"
        shell:
            "jgi_summarize_bam_contig_depths"
            " --outputDepth {output.depth}"
            " --pairedContigs {output.paired}"
            " --minContigLength {params.min_contig_length}"
            " --minContigDepth {params.min_contig_depth}"
            " {input.bam}"
            " > {log} 2>&1"

    checkpoint metabat2_bin:
        """
        Bin metagenome with MetaBat2
        """
        message:
            "Binning {wildcards.target} with MetaBat2"
        input:
            depth = "{:this:}/{target}.depth.txt",
            fasta = "{:prev:}/{:target:}.fasta.gz"
        output:
            bins = "{:this:}/{target}.bins",
            fasta = "{:this:}/{target}.{:bin:}.fasta.gz"
        log:
            "{:this:}/{target}.metabat.log"
        threads:
            32
        params:
            prefix = "{:this:}/{target}",

            min_contig_len = 2500,  # decrease if input quality very high
            max_p = 95,             # decrease if input quality very low
            max_edges = 200,        # decrease if input quality very low,
                                    # increase if completeness low
            min_s = 60,             # increase if input quality very low
            min_cls_size = 200000,  # minimum bp per bin
            seed = "123456"
        conda:
            "metabat2"
        shell:
            "exec >{log} 2>&1;"
            "metabat2"
            " --inFile {input.fasta}"
            " --abdFile {input.depth}"
            " --outFile {params.prefix}"
            " --minContig {params.min_contig_len}"
            " --maxP {params.max_p}"
            " --minS {params.min_s}"
            " --maxEdges {params.max_edges}"
            " --minClsSize {params.min_cls_size}"
            " --numThreads {threads}"
            " --seed {params.seed}"
            " --saveCls"
            ";"
            "mv {params.prefix} {params.prefix}.metabat.txt;"
            ""
            "touch {output.bins};"
            "if compgen -G '{params.prefix}*.fa' >/dev/null; then"
            "  echo '{params.prefix}'*.fa | xargs -P4 pigz;"
            "  for bin in {params.prefix}.*.fa.gz; do"
            "    base=${{bin#{params.prefix}.}};"
            "    binid=bin_${{base%.fa.gz}};"
            "    mv $bin {params.prefix}__$binid.fasta.gz;"
            "    echo $binid >> {output.bins};"
            "  done;"
            "fi"