Snakefile "ymp/rules/bmtagger.rules"

"""
BMTagger Rules

"""

Env(name="bmtagger", base="bioconda", packages=[
    "bmtagger",
    "blast >=2.7.1"  # make sure we don't get 2.2
])

with Stage("index_bmtagger") as S:
    rule bmtagger_bitmask:
        message:
            "BMTagger: bmtool indexing {input}"
        input:
            "{:prev:}/{:target:}.fasta.gz"
        output:
            "{:this:}/{target}.bitmask"
        log:
            "{:this:}/{target}.bitmask.log"
        threads:
            1
        params:
            tmp = "{:this:}/{target}.tmp.fa",
            wordsize = 18, # 18->8g, 17->2g, 16->512MB
        resources:
            mem = "16g",
        conda:
            "bmtagger"
        shell: """
        gunzip -c {input} > {params.tmp}
        bmtool \
        --fasta-file={params.tmp} \
        --output-file={output} \
        --word-size={params.wordsize} \
        > {log} 2>&1
        rm {params.tmp}
        """
        # --compress fails with small references (segfault in bmfilter)

    rule bmtagger_index:
        message:
            "BMTagger: srprism indexing {input}"
        input:
            "{:prev:}/{:target:}.fasta.gz"
        output:
            "{:this:}/{target}.srprism"
        log:
            "{:this:}/{target}.srprism.log"
        threads: 1
        resources:
            mem = "16g",
        conda:
            "bmtagger"
        shell: """
        srprism mkindex \
        --input {input} \
        --output {output} \
        --memory $(({resources.mem_mb} / 16 * 15)) \
        > {log} 2>&1
        touch {output}
        """


with Stage("filter_bmtagger", "remove_bmtagger") as S:
    S.doc("""
    Filter(-out) contaminant reads using BMTagger

    >>> ymp make toy.ref_phiX.index_bmtagger.remove_bmtagger
    >>> ymp make toy.ref_phiX.index_bmtagger.remove_bmtagger.assemble_megahit
    >>> ymp make toy.ref_phiX.index_bmtagger.filter_bmtagger
    >>> ymp make mpic.ref_phiX.index_bmtagger.remove_bmtagger
    """)

    rule bmtagger_find:
        "Match paired end reads against reference"
        message:
            "BMTagger: matching reads from {input.fq} to {input.srprism}"
        input:
            fq      = "{:prev:}/{:target:}.{:pairnames:}.fq.gz",
            bitmask = "{:prev:}/{:target:}.bitmask",
            srprism = "{:prev:}/{:target:}.srprism",
            tmpdir  = ancient("{:dir.tmp:}")
        output:
            matches = temp("{:this:}/{target}.txt"),
            matchgz =      "{:this:}/{target}.txt.gz"
        log:
                           "{:this:}/{target}.txt.log"
        threads:
            1
        params:
            matearg = "-2 <(gunzip -c {input.fq[1]})"
        resources:
            mem = "16g",
        conda:
            "bmtagger"
        shell: """
        bmtagger.sh \
            -b {input.bitmask} \
            -x {input.srprism} \
            -q 1 \
            -1 <(gunzip -c {input.fq[0]}) \
             {params.matearg} \
            -T {input.tmpdir} \
            -o {output.matches} \
            > {log} 2>&1
        gzip {output.matches} -c > {output.matchgz}
        """

    rule bmtagger_find_se:  # ymp: extends bmtagger_find
        "Match single end reads against reference"
        input:
            fq = ["{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz"]
        params:
            matearg = ""

    rule bmtagger_filter:
        "Filter reads using reference"
        message:
            "BMTagger: filtering {input.fq}"
        input:
            fq      = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            matches = "{:this:}/{target}.txt"
        output:
                      "{:this:}/{target}.{:pairnames[0]:}.fq.gz"
        log:
                      "{:this:}/{target}.{:pairnames[0]:}.fq.log"
        threads:
            8
        params:
            mate = 1,
            action = "-keep",
        resources:
            mem = "8g",
        conda:
            "bmtagger"
        shell: """
        extract_fullseq \
            {input.matches} \
            {params.action} \
            -fastq \
            -mate{params.mate} <(gunzip -c {input.fq}) | gzip -c > {output}  2>{log}
        """

    rule bmtagger_filter_revread: # ymp: extends bmtagger_filter
        input:
            fq = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
        output:
                 "{:this:}/{target}.{:pairnames[1]:}.fq.gz"
        log:
                 "{:this:}/{target}.{:pairnames[1]:}.fq.log"
        params:
            mate = 2,
            ext = "{:pairnames[1]:}"

    rule bmtagger_filter_out: # ymp: extends bmtagger_filter
        "Filter-out reads using reference"
        message:
            "BMTagger: filtering out {input.fq}"
        output:
                      "{:that:}/{target}.{pairsuff}.fq.gz"
        log:
                      "{:that:}/{target}.{pairsuff}.fq.log"
        params:
            action = "-remove"

    rule bmtagger_filter_all:
        message:
            "BMTagge: done"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:targets:}.{:pairnames:}.fq.gz"

    rule bmtagger_remove_all:
        message:
            "BMTagge: done"
        output:
            touch("{:that:}/all_targets.stamp")
        input:
            "{:that:}/{:targets:}.{:pairnames:}.fq.gz"