Snakefile "ymp/rules/rsem.rules"

Env(name="rsem", base="bioconda", packages="rsem")

RSEM_IDX = "chrlist grp idx.fa n2g.idx.fa seq ti transcripts.fa".split()

rule rsem_index:
    """Build Genome Index for RSEM"""
    message:
        "RSEM: Indexing {input.contigs}"
    input:
        contigs = "{path}/{source}.fasta",
        gtf     = "{path}/{source}.gtf"
    output:
        index   = expand("{{params.index}}.{ext}", ext=RSEM_IDX)
    log:
        "{params.index}.log"
    params:
        index   = "{path}.index/{source}.rsem",
    resources:
        mem = "20g",
    threads:
        1
    conda:
        "rsem"
    shell: """
    rsem-prepare-reference --gtf {input.gtf} {input.contigs} {params.index}  >{log} 2>&1
    """

with Stage("quant_rsem") as S:
    S.doc("""
    Quantify transcripts using RSEM
    """)
    rule rsem_quant:
        message:
            "RSEM: calculating expression"
        input:
            bam = "{:prev:}/{target}-annotated.{source}.bam",
            idx = expand("{{:reference.dir:}}.index/{{target}}.rsem.{ext}",
                         ext=RSEM_IDX)
        output:
            "{params.outprefix}.genes.results",
            "{params.outprefix}.isoforms.results"
        log:
            "{params.outprefix}.log"
        params:
            index = "{:reference.dir:}.index/{target}.rsem",
            outprefix = "{:this:}/{target}.{source}",
            forward_prob = 0, # P of having fwd read
        resources:
            mem = "16G",
        threads:
            16
        conda:
            "rsem"
        shell:
            "rsem-calculate-expression "
            " -p {threads} "
            " --bam "
            " --no-bam-output "
            " --estimate-rspd " # estimate read start position
            " --calc-ci" # calculate 95% credibility intervals and posterior mean estimates
            " --ci-memory $(({resources.mem_mb} / 16 * 10)) "
            " --forward-prob {params.forward_prob} "
            " --paired-end "
            " {input.bam} "
            " {params.index} "
            " {params.outprefix} "
            " >{log} 2>&1 "

    rule rsem_all_for_target:
        message:
            "RSEM: finished {output}"
        input:
            "{:this:}/{target}.{:sources:}.genes.results",
        output:
            touch("{:this:}/all_{target}")

    rule rsem_all:
        message:
            "RSEM: finished {output}"
        input:
            "{:this:}/all_{:targets:}"
        output:
            touch("{:this:}/all_targets.stamp")

    # TODO: SE mode