Snakefile "ymp/rules/star.rules"

Env(name="star", base="bioconda", packages="star")

rule star_index:
    """Build Genome Index for Star"""
    message:
        "Star: Indexing {input.contigs}"
    input:
        contigs = "{path}/{source}.fasta",
        gtf     = "{path}/{source}.gtf"
    output:
        gdir    = "{path}.index/{source}.star/",
        index   = "{path}.index/{source}.star/SA"
    log:
        std     = "{path}.index/{source}.star.log",
        log     = "{path}.index/{source}.star/Log.txt"
    threads:
        16
    params:
        overhang = 100
    resources:
        mem = "32g",
    shadow:
        "shallow"
    conda:
        "star"
    shell: """
    STAR \
      --runThreadN {threads} \
      --limitGenomeGenerateRAM $(({resources.mem_mb}-1000))000000 \
      --runMode genomeGenerate \
      --genomeDir {output.gdir} \
      --genomeFastaFiles {input.contigs} \
      --sjdbGTFfile {input.gtf} \
      --sjdbOverhang {params.overhang} \
      >{log.std} 2>&1
    mv Log.txt {log.log}
    """
    # TODO:
    # - pass --genomeSAindexNbases =min(14, math.log2(genomelen)/2-1)


with Stage("map_star") as S:
    S.doc("""
    Map RNA-Seq reads with STAR
    """)
    rule star_map:
        input:
            index = "{:reference.dir:}.index/{target}.star/SA",
            fq  = "{:prev:}/{source}.{:pairnames:}.fq.gz"
        output:
            bamgn = "{:this:}/{target}.{source}.bam",
            bamtr = "{:this:}/{target}-annotated.{source}.bam",
            sj    = "{:this:}/{target}.{source}.SJ.out.tab"
        log:
            std = "{:this:}/{target}.{source}.log",
            log = "{:this:}/{target}.{source}.Log.out",
            prg = "{:this:}/{target}.{source}.Log.progress.out",
            fin = "{:this:}/{target}.{source}.Log.final.out"
        params:
            outprefix = "{:this:}/{target}.{source}.",
            multimap_nmax = 10,
            quantmode = "TranscriptomeSAM",
            tmpdir = "{params.outprefix}_STAR_tmp"
        resources:
            mem = "32g",
        threads:
            16
        conda:
            "star"
        shell: """
        STAR \
        --genomeDir $(dirname {input.index}) \
        --genomeLoad NoSharedMemory \
        --runThreadN {threads} \
        --readFilesIn {input.fq} \
        --readFilesCommand zcat \
        --outFileNamePrefix {params.outprefix} \
        --outSAMtype BAM Unsorted \
        --outSAMunmapped Within \
        --outFilterMultimapNmax {params.multimap_nmax} \
        --outTmpDir {params.tmpdir} \
        --quantMode {params.quantmode} \
        >{log.std} 2>&1

        mv {params.outprefix}Aligned.out.bam {output.bamgn}
        mv {params.outprefix}Aligned.toTranscriptome.out.bam {output.bamtr}
        """

    # TODO: SE mode