Snakefile "ymp/rules/bbmap.rules"

"""
Rules using tools from the BBTools / BBMap suite by Brian Bushnell
"""

Env(name="bbmap", base="bioconda", packages=[
    "bbmap",
    "pigz",
    "pbgzip",
    "htslib",
    "samtools",
    "coreutils"  # for readlink -f
])

# TODO: find way to enable JNI

bbstats  = "bhist qhist aqhist bqhist lhist ihist ehist qahist "
bbstats += "indelhist mhist gchist idhist statsfile"
bbstats  = bbstats.split()

bbduk_stats = "bhist qhist qchist aqhist bqhist lhist gchist".split()


with Stage("index_bbmap") as S:
    S.doc("""
    >>> ymp make toy.ref_genome.index_bbmap
    """)
    rule bbmap_makedb:
        """
        Precomputes BBMap index
        """
        message:
            "BBMap: Indexing {input}"
        input:
            "{:prev:}/{target}.fasta.gz"
        output:
            directory("{:this:}/{target}")
        log:
            "{:this:}/{target}.log"
        params:
            mem=icfg.mem("80g")
        benchmark:
            "benchmarks/bbmap_makedb/{:this:}/{target}.txt"
        threads: 8
        conda:
            "bbmap"
        shell: """
        bbmap.sh \
            path={output} \
            ref={input} \
            threads={threads} \
            pigz unpigz \
            -Xmx{params.mem}m \
            >{log} 2>&1
        """

    rule bbmap_makedb_all:
        message:
            "BBMap: indexing done"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            directory("{:this:}/{:targets:}")


with Stage("correct_bbmap") as S:
    S.doc("""
    Correct read errors by overlapping inside tails

    Applies `BBMap's <bioconda:bbmap>` "bbmerge.sh ecco" mode. This will overlap the inside of
    read pairs and choose the base with the higher quality where the alignment
    contains mismatches and increase the quality score as indicated by the double
    observation where the alignment contains matches.

    >>> ymp make toy.correct_bbmap
    >>> ymp make mpic.correct_bbmap
    """)
    rule bbmap_error_correction:
        """Error correction with BBMerge overlapping"""
        message: "BBMap: applying error correction to {input[0]}"
        input:
            "{:prev:}/{:target:}.{:pairnames:}.fq.gz"
        output:
            "{:this:}/{target}.{:pairnames:}.fq.gz",
            adapter = "{:this:}/{target}.adapter.fq"
        log:
            "{:this:}/{target}.log"
        threads: 16
        benchmark:
            "benchmarks/bbmap_ecco/{:this:}/{target}.txt"
        params:
            inout  = "in={input[0]} out={output[0]}",
            inout2 = "in2={input[1]} out2={output[1]}",
            mem    = icfg.mem("80g")
        conda:
            "bbmap"
        shell: """
        bbmerge.sh {params.inout} {params.inout2} \
                   outadapter={output.adapter} \
                   ecco ecctadpole mix vstrict\
                   threads={threads} -Xmx{params.mem}m \
                   > {log} 2>&1
        """

    # FIXME: Is this applicable?
    rule bbmap_error_correction_se:  # ymp: extends bbmap_error_correction
        input:
            ["{:prev:}/{target}.{:pairnames[0]:}.fq.gz"],
        output:
            ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"],
        params:
            inout2 = ""

    rule bbmap_error_correction_all:
        message:
            "BBMap: error correction complete"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"


with Stage("trim_bbmap") as S:
    S.doc("""
    Trim adapters and low quality bases from reads

    Applies BBMap's "bbduk.sh".

    Parameters:
      A: append to enable adapter trimming
      Q20: append to select phred score cutoff (default 20)
      L20: append to select minimum read length (default 20)

    >>> ymp make toy.trim_bbmap
    >>> ymp make toy.trim_bbmapA
    >>> ymp make toy.trim_bbmapAQ10L10
    >>> ymp make mpic.trim_bbmap
    """)
    S.add_param("A", typ="flag", name="adapt", value="ref=$BB_RSRC/adapters.fa ")
    S.add_param("Q", typ="int", name="qual", default=20)
    S.add_param("L", typ="int", name="length", default=20)
    S.add_param("E", typ="int", name="entropy", default=0)
    rule bbmap_trim:
        """Trimming and Adapter Removal using BBTools BBDuk"""
        message:
            "BBMap: Trimming {input[0]} "
        input:
            "{:prev:}/{target}.{:pairnames:}.fq.gz"
        output:
            temp("{:this:}/{target}.{:pairnames[0]:}.fq.gz"),
            temp("{:this:}/{target}.{:pairnames[1]:}.fq.gz")
        log:
            "{:this:}/{target}.log"
        benchmark:
            "benchmarks/bbmap_trim/{:this:}/{target}.txt"
        params:
            k      = 23,
            mink   = 11,
            hdist  = 1,
            mem    = icfg.mem("80g"),
            entropy_window = 50,
            entropy_k = 5,
            flags  = "pigz unpigz",
            inout  = "in={input[0]} out={output[0]}",
            inout2 = "in2={input[1]} out2={output[1]}" # overriden by child rule
        threads: 16
        conda:
            "bbmap"
        shell:
            # find adapter dir:
            'BB_RSRC="$(dirname $(readlink -f $(command -v bbduk.sh)))/resources";'
            # run bbduk:
            'bbduk.sh'
            ' {params.inout} {params.inout2}'
            ' trimq={params.qual} qtrim=r'         # quality trimming
            ' minlength={params.length}'           # length filtering
            ' {params.adapt} '                     # adapter trimming
            ' ktrim=r'                             # 3' side only
            ' k={params.k}'                        # k for adapter matching
            ' mink={params.mink}'                  # k at read end
            ' hdist={params.hdist}'                # hamming distance, allow 1 mismatch
            ' tpe' # trimpairsevenly -- in case adapter detected in only one read
            ' tbo' # trimbyoverlap -- trim if read runs over other reads' end
            ' entropy=0.{params.entropy}'
            ' entropywindow={params.entropy_window}'
            ' entropyk={params.entropy_k}'
            ' {params.flags}'                      # processing settings
            ' threads={threads}'
            ' -Xmx{params.mem}m'
            ' >{log} 2>&1'

    rule bbmap_trim_se:  # ymp: extends bbmap_trim
        input:
            ["{:prev:}/{target}.{:pairnames[0]:}.fq.gz"]
        output:
            ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"]
        params:
            inout2 = ""

    localrules: bbmap_trim_all
    rule bbmap_trim_all:
        message:
            "BBMap: finished {output}"
        input:
            "{:this:}/{:fq_names:}.fq.gz"
        output:
            touch("{:this:}/all_targets.stamp")


with Stage("dedup_bbmap") as S:
    S.doc("""
    Remove duplicate reads

    Applies BBMap's "dedupe.sh"

    >>> ymp make toy.dedup_bbmap
    >>> ymp make mpic.dedup_bbmap
    """)
    rule bbmap_dedupe:
        """
        Deduplicate reads using BBMap's dedupe.sh
        """
        message:
            "BBTools dedupe'ing {input}"
        input:
            "{:prev:}/{:target:}.{:pairnames:}.fq.gz"
        output:
            "{:this:}/{target}.{:pairnames:}.fq.gz"
        log:
            "{:this:}/{target}.log"
        benchmark:
            "benchmarks/bbmap_dedupe/{:this:}/{target}.txt"
        params:
            mem = icfg.mem("80g")
        threads: 4
        conda:
            "bbmap"
        shell:
            "dedupe.sh"
            " unpigz"
            " threads={threads}"
            " in={input[0]}"
            " in2={input[1]}"
            " out=stdout"
            " -Xmx{params.mem}m "
            " 2>{log}"
            " |"
            " paste - - - -  - - - - | "
            " tee >(cut -f 1-4 | tr \"\t\" \"\\n\" | pigz -p {threads} > {output[0]}) | "
            " cut -f 5-8 | tr \"\t\" \"\\n\" | "
            " pigz -p {threads} > {output[1]}"


    rule bbmap_dedupe_se:
        """
        Deduplicate reads using BBMap's dedupe.sh
        """
        message:
            "BBTools dedupe'ing {input}"
        input:
            "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz"
        output:
            "{:this:}/{target}.{:pairnames[0]:}.fq.gz"
        log:
            "{:this:}/{target}.log"
        params:
            mem = icfg.mem("80g")
        threads: 4
        conda:
            "bbmap"
        shell:
            "dedupe.sh"
            " unpigz"
            " threads={threads}"
            " in={input[0]}"
            " out=stdout"
            " -Xmx{params.mem}m "
            " 2>{log}"
            " |"
            " pigz -p {threads} > {output[0]}"

    rule bbmap_dedupe_all:
        message:
            "BBMap: dedupe done"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"


ruleorder: bbmap_dedupe > bbmap_dedupe_se


with Stage("remove_bbmap", "filter_bbmap") as S:
    S.doc("""
    Filter reads by reference

    This stage aligns the reads with a given reference using BBMap in fast mode.
    Matching reads are collected in the stage *filter_bbmap* and remaining reads
    are collectec in the stage *remove_bbmap*.

    >>> ymp make toy.ref_phiX.index_bbmap.remove_bbmap
    >>> ymp make toy.ref_phiX.index_bbmap.filter_bbmap
    >>> ymp make mpic.ref_phiX.index_bbmap.remove_bbmap
    """)
    rule bbmap_split:
        message:
            "BBMap filtering - {input[0]}"
        input:
            fq    = "{:prev:}/{target}.{:pairnames:}.fq.gz",
            ref   = directory("{:prev:}/{:target:}")
        output:
            clean = "{:this:}/{target}.{:pairnames:}.fq.gz",
            human = "{:that:}/{target}.{:pairnames:}.fq.gz",
            stats = expand("{{:this:}}/{{target}}.{x}", x=bbstats)
        log:
                    "{:this:}/{target}.log"
        params:
            stats    = lambda wc, output: ["{}={}".format(x,y) for x,y in zip(bbstats, output.stats)],
            minid    = 0.95,
            maxindel = 3,
            bwr      = 0.16,
            bw       = 12,
            trimq    = 10,
            qtrim    = "rl",
            flags    = "quickmatch fast untrim machineout",
            minhits  = 2,
            mem      = icfg.mem("23g"),
            inout2   = "in2={input.fq[1]} outu2={output.clean[1]} outm2={output.human[1]}"
        threads:
            16
        conda:
            "bbmap"
        shell:
            "bbmap.sh "
            " minid={params.minid} "
            " maxindel={params.maxindel} "
            " bwr={params.bwr} "
            " bw={params.bw} "
            " {params.flags} "
            " minhits={params.minhits} "
            " path={input.ref} "
            " qtrim={params.qtrim} "
            " trimq={params.trimq} "
            " -Xmx{params.mem}m "
            " in={input.fq[0]} "
            " outu={output.clean[0]} "
            " outm={output.human[0]} "
            " {params.inout2} "
            " threads={threads} "
            " {params.stats} "
            " > {log} 2>&1"

    rule bbmap_split_se:  # ymp: extends bbmap_split
        input:
            fq    = ["{:prev:}/{target}.{:pairnames[0]:}.fq.gz"]
        output:
            clean = ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"],
            human = ["{:that:}/{target}.{:pairnames[0]:}.fq.gz"]
        params:
            inout2 = ""

    rule bbmap_split_all:
        message:
            "BBMap: split complete"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"

    rule bbmap_split_all_remove:
        message:
            "BBMap: split complete"
        output:
            touch("{:that:}/all_targets.stamp")
        input:
            "{:that:}/{:fq_names:}.fq.gz"


###
### Primer Filtering
###

with Stage("primermatch_bbmap", "primerfail_bbmap") as S:
    S.doc("""
    Filters reads by matching reference primer

    >>> ymp make mpic.ref_primers.primermatch_bbmap
    """)

    rule bbduk_primer:
        """
        Splits reads based on primer matching into "primermatch" and "primerfail".
        """
        message:
            "BBduk: Filtering {wildcards.target} for primer set {input.primer}"
        input:
            fq     = "{:prev:}/{:target:}.{:pairnames:}.fq.gz",
            primer = "{:prev:}/{:target:}.fasta.gz"
        output:
            match  = "{:this:}/{target}.{:pairnames:}.fq.gz.xx",
            fail   = "{:that:}/{target}.{:pairnames:}.fq.gz",
            stats  = expand("{{:this:}}/{{target}}.{x}", x=bbduk_stats)
        log:
            "{:this:}/{target}.log"
        threads: 8
        params:
            stats   = lambda wc, output: ["{}={}".format(x,y) for x,y in zip(bbstats, output.stats)],
            mem     = icfg.mem("80g"),
            k       = 12,
            rl      = 12,
            inout2  = "in2={input.fq[1]} outm2={output.match[1]} outu2={output.fail[1]}"
        conda: "bbmap"
        shell:
            'bbduk.sh'
            ' in={input.fq[0]} outm={output.match[0]} outu={output.fail[0]} '
            ' {params.inout2} '
            ' ref={input.primer}'
            ' k={params.k}'               # match using k-mers
            ' restrictleft={params.rl} '  # only match leftmost n bases
            ' maskmiddle=f'               # don't mask middle base in kmer
            ' rcomp=f'                    # don't check reverse complement
            ' copyundefined=t'            # expand wobbles in input
            ' removeifeitherbad=f'        # "bad" is "match", we want both to match
            ' pigz unpigz'
            ' {params.stats}'
            ' -Xmx{params.mem}m'
            ' >{log} 2>&1'

    rule bbduk_primer_se:  # ymp: extends bbduk_primer
        input:
            fq = ["{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz"]
        output:
            match = ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"],
            fail  = ["{:that:}/{target}.{:pairnames[0]:}.fq.gz"]
        params:
            inout2 = ""

    rule bbduk_primer_all:
        message:
            "BBMap: primer filter done"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"


with Stage("map_bbmap") as S:
    S.doc("""
    Map reads using BBMap

    >>> ymp make toy.assemble_megahit.map_bbmap
    >>> ymp make toy.ref_genome.map_bbmap
    >>> ymp make mpic.ref_ssu.map_bbmap
    """)
    S.add_param("", typ="choice", name="sensitivity",
                value=['F', 'N', 'S', 'VS'], default='N')
    rule bbmap_map:
        """Map read from each (co-)assembly read file to the assembly"""
        message:
            "BBMap mapping {input.fq[0]} to {input.fa}"
        input:
            fq = "{:prev:}/{:target:}.{:pairnames:}.fq.gz",
            fa = "{:prev:}/{:target:}.fasta.gz",
        output:
            bam   = temp("{:this:}/{target}.bam"),
            stats =      "{:this:}/{target}.stats",
            ihist =      "{:this:}/{target}.ihist"
        log:
                         "{:this:}/{target}.log"
        benchmark:
            "benchmarks/bbmap_map/{:this:}/{target}.txt"
        params:
            mem = icfg.mem("80g"),
            in2 = "in2={input.fq[1]}"
        threads:
            8
        conda:
            "bbmap"
        shell:
            "case {params.sensitivity} in"
            "  N) SENS='';; F) SENS='fast';; S) SENS='slow';; VS) SENS='vslow';;"
            "esac;"
            "bbmap.sh"
            " $SENS"
            " threads={threads}"
            " pigz unpigz"          # enable use of pigz for zip/unzip
            # this is disabled because not compiled in conda:
            # " jni"                # use JNI for C alignment algo
            " nodisk "              # don't write index to disk
            " ref={input.fa}"       # reference
            " in={input.fq[0]} {params.in2}"
            " out=stdout"           # write sam to stdout
            " machineout statsfile={output.stats}" # parseable stats
            " ihist={output.ihist}" # insert histogram
            " ambiguous=all"        # output all matches
            " mdtag"                # make source recoverable in sam
            " trimreaddescriptions" # use header until space like bowtie2
            " -Xmx{params.mem}m"     # set JVM heap size
            " 2>{log}"              # log error
            " | samtools view -b -o {output.bam} -" # convert to bam

    rule bbmap_map_SE: # ymp: extends bbmap_map
        input:
            fq = ["{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz"]
        params:
            in2 = ""

    rule bbmap_map_all:
        message:
            "BBMap: mapping done"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:targets:}.bam"


with Stage("dust_bbmap") as S:
    S.doc("""
    Perform entropy filtering on reads using BBMap's bbduk.sh

    The parameter ``Enn`` gives the entropy cutoff. Higher values
    filter more sequences.

    >>> ymp make toy.dust_bbmap
    >>> ymp make toy.dust_bbmapE60
    """)
    S.add_param("E", typ="int", name="entropy", default=10)

    rule bbmap_dust:
        message:
            "BBMap: removing low entropy reads"
        input:
            fq = "{:prev:}/{target}.{:pairnames:}.fq.gz"
        output:
            fq = "{:this:}/{target}.{:pairnames:}.fq.gz"
        log:
                 "{:this:}/{target}.log"
        benchmark:
            "benchmarks/bbmap_dust/{:this:}/{target}.txt"
        threads: 8
        conda: "bbmap"
        params:
            entropy_window = 50,
            entropy_k = 5,
            inout2 = "in2={input.fq[1]} out2={output.fq[1]}",
            mem = icfg.mem("80g")
        shell:
            'bbduk.sh'
            ' in={input.fq[0]} out={output.fq[0]}'
            ' {params.inout2}'
            ' entropy=0.{params.entropy}'
            ' entropywindow={params.entropy_window}'
            ' entropyk={params.entropy_k}'
            ' pigz unpigz'
            ' -Xmx{params.mem}m'
            ' >{log} 2>&1'

    rule bbmap_dust_all:
        message:
            "BBMap: fishished removing low entropy reads"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:targets:}.{:pairnames:}.fq.gz"


with Stage("format_bbmap") as S:
    S.doc("""
    Process sequences with BBMap's format.sh

    Parameter ``Ln`` filters sequences at a minimum length.

    >>> ymp make toy.assemble_metaspades.format_bbmapL200
    """)
    S.add_param("L", typ="int", name="length", default=1)
    rule bbmap_reformat:
        message:
            "BBMap: reformatting fasta"
        input:
            fasta = "{:prev:}/{target}.fasta.gz"
        output:
            fasta = "{:this:}/{target}.fasta.gz"
        log:
                    "{:this:}/{target}.log"
        threads:
            4
        conda:
            "bbmap"
        shell:
            'reformat.sh'
            ' in={input.fasta}'
            ' out={output.fasta}'
            ' >{log} 2>&1'
            ' bgzip'
            ' fastawrap=0'  # sequence on one line
            ' fastaminlen={params.length}'

    rule bbmap_reformat_all:
        message:
            "BBMap: completed {output}"
        input:
            "{:this:}/{:targets:}.fasta.gz"
        output:
            touch("{:this:}/all_targets.stamp")