Snakefile "ymp/rules/trinity.rules"

from ymp.util import check_input, filter_input

Env(name="trinity", base="bioconda", packages="trinity")

with Stage("assemble_trinity") as S:
    rule trinity:
        message:
            "Trinity: assembling {wildcards.target}"
        input:
            r1 = "{:prev:}/{:target:}.{:pairnames[0]:}.fq.gz",
            r2 = "{:prev:}/{:target:}.{:pairnames[1]:}.fq.gz"
        output:
            fa = "{:this:}/{target}.fasta.gz",
            tm = temp("{:this:}/{target}.timing"),
            mp = "{:this:}/{target}.fasta.gene_trans_map"
        log:
            "{:this:}/{target}.log"
        params:
            r1 = filter_input("r1", also=["r2"], minsize=1, join=","),
            r2 = filter_input("r2", also=["r1"], minsize=1, join=","),
            # Allow failure if we have <50 sequences
            sufficient_input = check_input(["r1", "r2"], minlines=50*4*2),
            min_contig_length = 200,
            lib_type = "FR",
            # outdir must contain word "trinity"
            outdir = "{:this:}/{target}.trinity.tmp"
        resources:
            mem = "32g",
        threads:
            24
        conda:
            "trinity"
        shell: """
        make_empty_result() {{
           echo "YMP: skipping assembly - $*" >> {log}
           mkdir -p {params.outdir}
           touch {params.outdir}/Trinity.fasta
           touch {params.outdir}/Trinity.timing
           touch {params.outdir}/Trinity.fasta.gene_trans_map
        }}
        if [ -z "{params.r1}" ]; then
           make_empty_result "all input empty"
        else
           ARGS=
           ARGS="$ARGS --seqType fq"
           ARGS="$ARGS --SS_lib_type {params.lib_type}"
           ARGS="$ARGS --max_memory {resources.mem_gb}G"
           ARGS="$ARGS --left {params.r1} --right {params.r2}"
           ARGS="$ARGS --CPU {threads}"
           ARGS="$ARGS --min_contig_length {params.min_contig_length}"
           ARGS="$ARGS --output {params.outdir}"
           echo "YMP: Running Trinity $ARGS" >> {log}
           if ! Trinity $ARGS >>{log} 2>&1; then
              if [ "{params.sufficient_input}" = "False" ]; then
                 make_empty_result "Assuming failure due to low read count"
              else
                 exit 1
              fi
           fi
        fi

        gzip -c {params.outdir}/Trinity.fasta > {output.fa}
        mv {params.outdir}/Trinity.timing {output.tm}
        mv {params.outdir}/Trinity.fasta.gene_trans_map {output.mp}
        rm -rf {params.outdir}
        """

    rule trinity_stats:
        message:
            "Trinity: collecting assembly stats"
        input:
            "{:this:}/{target}.fasta.gz"
        output:
            "{:this:}/{target}.trinity-stats"
        conda:
            "trinity"
        shell:
            "TrinityStats.pl {input} > {output}"