Snakefile "ymp/rules/bbmap.rules"

"""
Rules using tools from the BBTools / BBMap suite by Brian Bushnell
"""

Env(name="bbmap", base="bioconda", packages=[
    "bbmap",
    "pigz",
    "pbgzip",
    "htslib",
    "samtools",
    "coreutils"  # for readlink -f
])

bbstats  = "bhist qhist aqhist bqhist lhist ihist ehist qahist "
bbstats += "indelhist mhist gchist idhist statsfile"
bbstats  = bbstats.split()

bbduk_stats = "bhist qhist qchist aqhist bqhist lhist gchist".split()


with Stage("index_bbmap") as S:
    S.doc("""
    Creates `BBMap <bioconda:bbmap>` index

    >>> ymp make toy.ref_genome.index_bbmap
    """)
    rule bbmap_makedb:
        """
        Precomputes BBMap index
        """
        message:
            "{:name:}: Indexing {input}"
        input:
            "{:prev:}/{target}.fasta.gz"
        output:
            directory("{:this:}/{target}")
        log:
            "{:this:}/{target}.log"
        resources:
            mem = "80g",
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        threads: 8
        conda:
            "bbmap"
        shell: """
        bbmap.sh \
            path={output} \
            ref={input} \
            threads={threads} \
            pigz unpigz \
            -Xmx{resources.mem_mb}m \
            >{log} 2>&1
        """


with Stage("correct_bbmap") as S:
    S.doc("""
    Correct read errors by overlapping inside tails

    Applies `BBMap's <bioconda:bbmap>` "bbmerge.sh ecco" mode. This will overlap the inside of
    read pairs and choose the base with the higher quality where the alignment
    contains mismatches and increase the quality score as indicated by the double
    observation where the alignment contains matches.

    >>> ymp make toy.correct_bbmap
    >>> ymp make mpic.correct_bbmap
    """)
    rule bbmap_error_correction:
        """Error correction with BBMerge overlapping"""
        message: "{:name:}: applying error correction to {input[0]}"
        input:
            "{:prev:}/{:target:}.{:pairnames:}.fq.gz"
        output:
            "{:this:}/{target}.{:pairnames:}.fq.gz",
            adapter = "{:this:}/{target}.adapter.fq"
        log:
            "{:this:}/{target}.log"
        threads: 16
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        params:
            inout  = "in={input[0]} out={output[0]}",
            inout2 = "in2={input[1]} out2={output[1]}",
        resources:
            mem = "80g",
        conda:
            "bbmap"
        shell: """
        bbmerge.sh {params.inout} {params.inout2} \
                   outadapter={output.adapter} \
                   ecco ecctadpole mix vstrict\
                   threads={threads} -Xmx{resources.mem_mb}m \
                   > {log} 2>&1
        """

    # FIXME: Is this applicable?
    rule bbmap_error_correction_se:  # ymp: extends bbmap_error_correction
        input:
            ["{:prev:}/{target}.{:pairnames[0]:}.fq.gz"],
        output:
            ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"],
        params:
            inout2 = ""

    rule bbmap_error_correction_all:
        message:
            "BBMap: error correction complete"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"


with Stage("trim_bbmap") as S:
    S.doc("""
    Trim adapters and low quality bases from reads

    Applies `BBMap's <bioconda:bbmap>` "bbduk.sh".

    Parameters:
      A: append to enable adapter trimming
      Q20: append to select phred score cutoff (default 20)
      L20: append to select minimum read length (default 20)

    >>> ymp make toy.trim_bbmap
    >>> ymp make toy.trim_bbmapA
    >>> ymp make toy.trim_bbmapAQ10L10
    >>> ymp make mpic.trim_bbmap
    """)
    S.add_param("A", typ="flag", name="adapt", value="ref=$BB_RSRC/adapters.fa ")
    S.add_param("Q", typ="int", name="qual", default=20)
    S.add_param("L", typ="int", name="length", default=20)
    S.add_param("E", typ="int", name="entropy", default=0)
    rule bbmap_trim:
        """Trimming and Adapter Removal using BBTools BBDuk"""
        message:
            "{:name:}: Trimming {input[0]} "
        input:
            "{:prev:}/{target}.{:pairnames:}.fq.gz"
        output:
            temp("{:this:}/{target}.{:pairnames[0]:}.fq.gz"),
            temp("{:this:}/{target}.{:pairnames[1]:}.fq.gz")
        log:
            "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        params:
            k      = 23,
            mink   = 11,
            hdist  = 1,
            entropy_window = 50,
            entropy_k = 5,
            flags  = "pigz unpigz",
            inout  = "in={input[0]} out={output[0]}",
            inout2 = "in2={input[1]} out2={output[1]}" # overriden by child rule
        resources:
            mem = "80g",
        threads: 16
        conda:
            "bbmap"
        shell:
            # find adapter dir:
            'BB_RSRC="$(dirname $(readlink -f $(command -v bbduk.sh)))/resources";'
            # run bbduk:
            'bbduk.sh'
            ' {params.inout} {params.inout2}'
            ' trimq={params.qual} qtrim=r'         # quality trimming
            ' minlength={params.length}'           # length filtering
            ' {params.adapt} '                     # adapter trimming
            ' ktrim=r'                             # 3' side only
            ' k={params.k}'                        # k for adapter matching
            ' mink={params.mink}'                  # k at read end
            ' hdist={params.hdist}'                # hamming distance, allow 1 mismatch
            ' tpe' # trimpairsevenly -- in case adapter detected in only one read
            ' tbo' # trimbyoverlap -- trim if read runs over other reads' end
            ' entropy=0.{params.entropy}'
            ' entropywindow={params.entropy_window}'
            ' entropyk={params.entropy_k}'
            ' {params.flags}'                      # processing settings
            ' threads={threads}'
            ' -Xmx{resources.mem_mb}m'
            ' >{log} 2>&1'

    rule bbmap_trim_se:  # ymp: extends bbmap_trim
        input:
            ["{:prev:}/{target}.{:pairnames[0]:}.fq.gz"]
        output:
            ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"]
        params:
            inout2 = ""

    localrules: bbmap_trim_all
    rule bbmap_trim_all:
        message:
            "BBMap: finished {output}"
        input:
            "{:this:}/{:fq_names:}.fq.gz"
        output:
            touch("{:this:}/all_targets.stamp")


with Stage("dedup_bbmap") as S:
    S.doc("""
    Remove duplicate reads

    Applies `BBMap's <bioconda:bbmap>` "dedupe.sh"

    >>> ymp make toy.dedup_bbmap
    >>> ymp make mpic.dedup_bbmap
    """)
    rule bbmap_dedupe:
        """
        Deduplicate reads using BBMap's dedupe.sh
        """
        message:
            "{:name:}: Processing {input[0]}"
        input:
            "{:prev:}/{:target:}.{:pairnames:}.fq.gz"
        output:
            "{:this:}/{target}.{:pairnames:}.fq.gz"
        log:
            "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        resources:
            mem = "80g",
        threads: 4
        conda:
            "bbmap"
        shell:
            "dedupe.sh"
            " unpigz"
            " threads={threads}"
            " in={input[0]}"
            " in2={input[1]}"
            " out=stdout"
            " -Xmx{resources.mem_mb}m"
            " 2>{log}"
            " |"
            " paste - - - -  - - - - | "
            " tee >(cut -f 1-4 | tr \"\t\" \"\\n\" | pigz -p {threads} > {output[0]}) | "
            " cut -f 5-8 | tr \"\t\" \"\\n\" | "
            " pigz -p {threads} > {output[1]}"


    rule bbmap_dedupe_se:
        """
        Deduplicate reads using `BBMap's <bioconda:bbmap>` dedupe.sh
        """
        message:
            "{:name:}: Processing {input}"
        input:
            "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz"
        output:
            "{:this:}/{target}.{:pairnames[0]:}.fq.gz"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        log:
            "{:this:}/{target}.log"
        resources:
            mem = "80g",
        threads: 4
        conda:
            "bbmap"
        shell:
            "dedupe.sh"
            " unpigz"
            " threads={threads}"
            " in={input[0]}"
            " out=stdout"
            " -Xmx{resources.mem_mb}m"
            " 2>{log}"
            " |"
            " pigz -p {threads} > {output[0]}"

    rule bbmap_dedupe_all:
        message:
            "BBMap: dedupe done"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"


ruleorder: bbmap_dedupe > bbmap_dedupe_se


with Stage("remove_bbmap", "filter_bbmap") as S:
    S.doc("""
    Filter reads by reference

    This stage aligns the reads with a given reference using `BBMap <bioconda:bbmap>` in fast mode.
    Matching reads are collected in the stage *filter_bbmap* and remaining reads
    are collectec in the stage *remove_bbmap*.

    >>> ymp make toy.ref_phiX.index_bbmap.remove_bbmap
    >>> ymp make toy.ref_phiX.index_bbmap.filter_bbmap
    >>> ymp make mpic.ref_phiX.index_bbmap.remove_bbmap
    """)
    rule bbmap_split:
        message:
            "{:name:}: {input[0]}"
        input:
            fq    = "{:prev:}/{target}.{:pairnames:}.fq.gz",
            ref   = "{:prev:}/{:target:}"
        output:
            clean = "{:this:}/{target}.{:pairnames:}.fq.gz",
            human = "{:that:}/{target}.{:pairnames:}.fq.gz",
            stats = expand("{{:this:}}/{{target}}.{x}", x=bbstats)
        log:
                    "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        params:
            stats    = lambda wc, output: ["{}={}".format(x,y) for x,y in zip(bbstats, output.stats)],
            minid    = 0.95,
            maxindel = 3,
            bwr      = 0.16,
            bw       = 12,
            trimq    = 10,
            qtrim    = "rl",
            flags    = "quickmatch fast untrim machineout",
            minhits  = 2,
            inout2   = "in2={input.fq[1]} outu2={output.clean[1]} outm2={output.human[1]}",
        resources:
            mem = "80g",
        threads:
            16
        conda:
            "bbmap"
        shell:
            "bbmap.sh "
            " minid={params.minid} "
            " maxindel={params.maxindel} "
            " bwr={params.bwr} "
            " bw={params.bw} "
            " {params.flags} "
            " minhits={params.minhits} "
            " path={input.ref} "
            " qtrim={params.qtrim} "
            " trimq={params.trimq} "
            " -Xmx{resources.mem_mb}m "
            " in={input.fq[0]} "
            " outu={output.clean[0]} "
            " outm={output.human[0]} "
            " {params.inout2} "
            " threads={threads} "
            " {params.stats} "
            " > {log} 2>&1"

    rule bbmap_split_se:  # ymp: extends bbmap_split
        input:
            fq    = ["{:prev:}/{target}.{:pairnames[0]:}.fq.gz"]
        output:
            clean = ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"],
            human = ["{:that:}/{target}.{:pairnames[0]:}.fq.gz"]
        params:
            inout2 = ""

    rule bbmap_split_all:
        message:
            "BBMap: split complete"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"

    rule bbmap_split_all_remove:
        message:
            "BBMap: split complete"
        output:
            touch("{:that:}/all_targets.stamp")
        input:
            "{:that:}/{:fq_names:}.fq.gz"


###
### Primer Filtering
###

with Stage("primermatch_bbmap", "primerfail_bbmap") as S:
    S.doc("""
    Filters reads by matching reference primer using `BBMap's <bioconda:bbmap>` "bbduk.sh".

    >>> ymp make mpic.ref_primers.primermatch_bbmap
    """)

    rule bbduk_primer:
        """
        Splits reads based on primer matching into "primermatch" and "primerfail".
        """
        message:
            "{:name:}: Filtering {wildcards.target} for primer set {input.primer}"
        input:
            fq     = "{:prev:}/{:target:}.{:pairnames:}.fq.gz",
            primer = "{:prev:}/{:target:}.fasta.gz"
        output:
            match  = "{:this:}/{target}.{:pairnames:}.fq.gz.xx",
            fail   = "{:that:}/{target}.{:pairnames:}.fq.gz",
            stats  = expand("{{:this:}}/{{target}}.{x}", x=bbduk_stats)
        log:
            "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        threads: 8
        params:
            stats   = lambda wc, output: ["{}={}".format(x,y) for x,y in zip(bbstats, output.stats)],
            k       = 12,
            rl      = 12,
            inout2  = "in2={input.fq[1]} outm2={output.match[1]} outu2={output.fail[1]}"
        resources:
            mem     = "80g",
        conda: "bbmap"
        shell:
            'bbduk.sh'
            ' in={input.fq[0]} outm={output.match[0]} outu={output.fail[0]} '
            ' {params.inout2} '
            ' ref={input.primer}'
            ' k={params.k}'               # match using k-mers
            ' restrictleft={params.rl} '  # only match leftmost n bases
            ' maskmiddle=f'               # don't mask middle base in kmer
            ' rcomp=f'                    # don't check reverse complement
            ' copyundefined=t'            # expand wobbles in input
            ' removeifeitherbad=f'        # "bad" is "match", we want both to match
            ' pigz unpigz'
            ' {params.stats}'
            ' -Xmx{resources.mem_mb}m'
            ' >{log} 2>&1'

    rule bbduk_primer_se:  # ymp: extends bbduk_primer
        input:
            fq = ["{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz"]
        output:
            match = ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"],
            fail  = ["{:that:}/{target}.{:pairnames[0]:}.fq.gz"]
        params:
            inout2 = ""

    rule bbduk_primer_all:
        message:
            "BBMap: primer filter done"
        output:
            touch("{:this:}/all_targets.stamp")
        input:
            "{:this:}/{:fq_names:}.fq.gz"


def ensure_list(x):
    if isinstance(x, str):
        return [x]
    return x

with Stage("map_bbmap") as S:
    S.doc("""
    Map reads using `BBMap <bioconda:bbmap>`

    >>> ymp make toy.assemble_megahit.map_bbmap
    >>> ymp make toy.ref_genome.map_bbmap
    >>> ymp make mpic.ref_ssu.map_bbmap
    """)
    S.add_param("", typ="choice", name="sensitivity",
                value=['F', 'N', 'S', 'VS'], default='N')
    rule bbmap_map:
        """Map read from each (co-)assembly read file to the assembly"""
        message:
            "BBMap mapping {input.fq1} to {input.fa}"
        input:
            fq1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            fq2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz",
            fa = "{:prev:}/{:target:}.fasta.gz",
        output:
            bam   = temp("{:this:}/{target}.bam"),
            stats =      "{:this:}/{target}.stats",
            ihist =      "{:this:}/{target}.ihist"
        log:
                         "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        params:
            in1 = lambda wc, input: "in="+",".join(ensure_list(input.fq1)),
            in2 = lambda wc, input: "in2="+",".join(ensure_list(input.fq2))
        resources:
            mem = "80g",
        threads:
            8
        conda:
            "bbmap"
        shell:
            "case {params.sensitivity} in"
            "  N) SENS='';; F) SENS='fast';; S) SENS='slow';; VS) SENS='vslow';;"
            "esac;"
            "bbwrap.sh mapper=bbmap"
            " $SENS"
            " threads={threads}"
            " pigz unpigz"          # enable use of pigz for zip/unzip
            # this is disabled because not compiled in conda:
            # " jni"                # use JNI for C alignment algo
            " nodisk "              # don't write index to disk
            " ref={input.fa}"       # reference
            " {params.in1} {params.in2}"
            " out=stdout"           # write sam to stdout
            " machineout statsfile={output.stats}" # parseable stats
            " ihist={output.ihist}" # insert histogram
            " ambiguous=all"        # output all matches
            " mdtag"                # make source recoverable in sam
            " trimreaddescriptions" # use header until space like bowtie2
            " -Xmx{resources.mem_mb}m"     # set JVM heap size
            " 2>{log}"              # log error
            " | samtools view --threads 3 -b -o {output.bam} -" # convert to bam

    rule bbmap_map_SE: # ymp: extends bbmap_map
        input:
            fq2 = []
        params:
            in2 = ""


with Stage("dust_bbmap") as S:
    S.doc("""
    Perform entropy filtering on reads using `BBMap's <bioconda:bbmap>` bbduk.sh

    The parameter ``Enn`` gives the entropy cutoff. Higher values
    filter more sequences.

    >>> ymp make toy.dust_bbmap
    >>> ymp make toy.dust_bbmapE60
    """)
    S.add_param("E", typ="int", name="entropy", default=10)

    rule bbmap_dust:
        message:
            "BBMap: removing low entropy reads"
        input:
            fq = "{:prev:}/{target}.{:pairnames:}.fq.gz"
        output:
            fq = "{:this:}/{target}.{:pairnames:}.fq.gz"
        log:
                 "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        threads: 8
        conda: "bbmap"
        params:
            entropy_window = 50,
            entropy_k = 5,
            inout2 = "in2={input.fq[1]} out2={output.fq[1]}",
        resources:
            mem = "80g",
        shell:
            'bbduk.sh'
            ' in={input.fq[0]} out={output.fq[0]}'
            ' {params.inout2}'
            ' entropy=0.{params.entropy}'
            ' entropywindow={params.entropy_window}'
            ' entropyk={params.entropy_k}'
            ' pigz unpigz'
            ' -Xmx{resources.mem_mb}m'
            ' >{log} 2>&1'


with Stage("format_bbmap") as S:
    S.doc("""
    Process sequences with `BBMap's <bioconda:bbmap>` format.sh

    Parameter ``Ln`` filters sequences at a minimum length.

    >>> ymp make toy.assemble_spades.format_bbmapL200
    """)
    S.add_param("L", typ="int", name="length", default=1)
    rule bbmap_reformat:
        message:
            "BBMap: reformatting fasta"
        input:
            fasta = "{:prev:}/{:target:}.fasta.gz"
        output:
            fasta = "{:this:}/{target}.fasta.gz"
        log:
                    "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        params:
            multi = lambda wc, input: isinstance(input.fasta, list)
        threads:
            4
        conda:
            "bbmap"
        shell:
            'exec >{log} 2>&1; set -x;'
            '('
            '  if [ "{params.multi}" == "False" ]; then'
            '    gzip -dc {input.fasta};'
            '  else'
            '    for n in {input.fasta}; do'
            '      name="$(basename $n)"'
            '      name="${{name%.fasta.gz}}";'
            '      gzip -dc "$n" | sed "s/^>/>${{name}}_/";'
            '    done;'
            '  fi'
            ') |'
            'reformat.sh'
            ' in=stdin.fasta'
            ' out={output.fasta}'
            ' bgzip'
            ' fastawrap=0'  # sequence on one line
            ' fastaminlen={params.length}'