Snakefile "ymp/rules/samtools.rules"

Env(name="samtools", base="bioconda", packages=["samtools", "htslib"])

with Stage("extract_reads") as S:
    S.doc("""
    Extract reads from BAM file using ``samtools fastq``.

    Parameters ``fn``, ``Fn`` and ``Gn`` are passed through to
    ``samtools view``. Reads are output *only* if all bits in ``f`` are
    set, *none* of the bits in ``F`` are set, and *any* of the bits in
    ``G`` is *unset*.

    1: paired
    2: proper pair (both aligned in right orientation)
    4: unmapped
    8: other read unmapped

    Some options include:

    - f2: correctly mapped (only proper pairs)
    - F12: both ends mapped (but potentially "improper")
    - G12: either end mapped
    - F2: not correctly mapped (not proper pair, could also be unmapped)
    - f12: not mapped (neither read mapped)
    """)
    S.add_param("f", typ="int", name="f", default=0)
    S.add_param("F", typ="int", name="F", default=0x900)
    S.add_param("G", typ="int", name="G", default=0)
    rule samtools_fastq:
        message:
            "{:name:}: {output.pairs[0]}"
        input:
            bam =  "{:prev:}/{target}.bam"
        output:
            pairs = "{:this:}/{target}.{:pairnames:}.fq.gz"
        log:
                    "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        threads:
            4
        conda:
            "samtools"
        shell:
            "samtools fastq"
            #" -0 /dev/null"
            #" -s /dev/null"
            " -1 {output.pairs[0]}"
            " -2 {output.pairs[1]}"
            " --threads {threads}"
            " -f {params.f}"
            " -F {params.F}"
            " -G {params.G}"
            " {input}"
            " 2>&1 >{log}"


with Stage("extract_seqs") as S:
    S.doc("""
    Extract sequences from ``.fasta.gz`` file using ``samtools faidx``

    Currently requires a ``.blast7`` file as input.

    Use parameter ``Nomatch`` to instead keep unmatched sequences.
    """)
    S.add_param("Nomatch", typ="flag", name="match",
                value="-13", default="-12")

    rule samtools_select_blast:
        message:
            "{:name:}: IDs from blast {output.regions)"
        input:
            fasta = "{:prev:}/{:target:}.fasta.gz",
            blast7 = "{:prev:}/{:target:}.blast7.gz"
        output:
            regions = temp("{:this:}/{target}.ids")
        log:
            "{:this:}/{target}.ids_from_blast.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.ids_from_blast.txt"
        threads:
            1
        shell:
            r"exec >{log} 2>&1;"
            r"comm {params.match}"
            r" <(gzip -dc {input.blast7} | grep -v '^#' | cut -f1 -d $'\t' | sort | uniq)"
            r" <(gzip -dc {input.fasta} | grep '^>' | "
            r"   sed -n '/^>/ s/>\([^[:space:]]*\).*/\1/p' | sort)"
            r" >{output.regions}"

    rule samtools_faidx:
        message:
            "{:name:}: {output.fasta}"
        input:
            fasta = "{:prev:}/{:target:}.fasta.gz",
            regions = "{:this:}/{target}.ids"
        output:
            fasta = "{:this:}/{target}.fasta.gz"
        log:
                    "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        threads:
            2
        conda:
            "samtools"
        shadow:
            "minimal"
        shell:
            'if [ 0 -eq $(wc -l < {input.regions}) ]; then'
            '  echo "YMP: no sequences" >{log};'
            '  echo -n;'
            'else'
            '  samtools faidx'
            '   --length 99999999'
            '   --region-file {input.regions}'
            '   {input.fasta}'
            '   2>{log};'
            'fi | '
            'bgzip '
            ' --stdout'
            ' --threads {threads}'
            ' >{output.fasta}'


with Stage("coverage_samtools") as S:
    S.doc("""
    Computes coverage from a sorted bam file using ``samtools coverage``
    """)
    rule samtools_coverage:
        message:
            "{:name:} {output}"
        input:
            "{:prev:}/{target}.sorted.bam"
        output:
            "{:this:}/{target}.coverage"
        log:
            "{:this:}/{target}.log"
        benchmark:
            "benchmarks/{:name:}/{:this:}/{target}.txt"
        conda:
            "samtools"
        resources:
            mem = "4g",
        threads:
            1
        shell:
            'samtools coverage'
            ' {input}'
            ' -o {output}'
            ' >{log} 2>&1'