Snakefile "ymp/rules/00_import.rules"

"""
This file contains the rules dealing with FQ file provisioning and preparation. 
"""

###
###  Create configured directories if requested
###

localrules: mkdir
rule mkdir:
    """
    Auto-create directories listed in ymp config.

    Use these as input:
    >>> input: tmpdir = ancient(ymp.get_config().dir.tmp)
    Or as param:
    >>> param: tmpdir = "{:ensuredir.tmp:}"

    """
    message: "Creating directory '{output}'"
    output: directory("{{x,({})}}".format("|".join(list(ymp.get_config().dir))))
    shell: "mkdir -p {output}"

###
###  SRA access 
###


localrules: prefetch
rule prefetch:
    """
    Downloads SRA files into NCBI SRA folder (ncbi/public/sra).
    """
    # get path with
    # vdb-config /repository/user/main/public/root/
    # ?
    message:
        "Pre-Fetching {wildcards.SRR}"
    output:
        "{:dir.sra:}/{SRR}.sra"
    wildcard_constraints:
        SRR = r"[EDS]RR[0-9]+",
    conda:
        "sratools.yml"
    shell: """
    prefetch {wildcards.SRR}
    """


rule fastq_dump:
    """
    Extracts FQ from SRA files
    """
    message:
        "Extracting FastQ from {wildcards.SRR}"
    output:
        "{:dir.scratch:}/SRR/{SRR}_1.fastq.gz",
        "{:dir.scratch:}/SRR/{SRR}_2.fastq.gz"
    wildcard_constraints:
        SRR = r"[EDS]RR[0-9]+",
    params:
        outdir = "{:ensuredir.scratch:}/SRR",
        p      = lambda wc, threads: int(threads/2+.5),
    resources:
        mem = "200M",
    conda:
        "sratools.yml"
    threads:
        4
    # FIXME
    # the two cut processes use about 1 cpu each, fastqdump 1/4 and pgzip about 1 each.
    # not ideal. not sure why cut needs so much time. 
    shell: """
    fastq-dump {wildcards.SRR} \
        --split-files \
        --readids \
        --dumpbase \
        --skip-technical \
        --clip \
        --read-filter pass \
        --stdout | \
      paste - - - -  - - - - | \
      tee >(cut -f 1-4 | tr "\t" "\\n" | pigz -p {params.p} > {output[0]}) | \
      cut -f 5-8 | tr "\t" "\\n" | pigz -p {params.p} > {output[1]}
    """

with Stage("") as S:
    S.doc("""
    Imports raw read files into YMP.

    >>> ymp make toy
    >>> ymp make mpic
    """)
    S.add_param(typ="choice", name="project",
                value=list(ymp.get_config().projects.keys()), key="")

    _ymp_stage_import = S
    localrules: symlink_raw_reads
    rule symlink_raw_reads:
        """Normalize FQ names by creating symlinks to original files"""
        message:
            "Creating symlink {output} -> {input}"
        input:
            # extract path from config file:
            "{:raw_reads_source_path(target, 0):}",
            "{:raw_reads_source_path(target, 1):}"
        output:
            "{:this:}/{target}.{:pairnames:}.fq.gz"
        run:
            for n in range(len(input)):
                if not os.path.isabs(input[n]):
                    input[n] = os.path.join("..", input[n])
                os.symlink(input[n], output[n])

    localrules: symlink_raw_reads_SE
    rule symlink_raw_reads_SE: # ymp: extends symlink_raw_reads
        input:
            "{:raw_reads_source_path(target, 0):}"
        output:
            ["{:this:}/{target}.{:pairnames[0]:}.fq.gz"]

    rule export_qiime_map_file:
        message:
            "Creating Qiime-Style map file"
        output:
            "{:this:}/qiime_mapping.tsv"
        params:
            variables = "{:variables:}",
            project = "{:project_name:}"
        run:
            # Note: Do not use variables inside of "run" that are reassigned
            #       elsewhere. This is why we pass "S" via params. Otherwise,
            #       it would be the last assigned value of S, so likely
            #       some other stage.
            import csv
            project = ymp.get_config().projects[params.project]
            cols = params.variables
            if "Description" in cols:
                desc_idx = cols.index("Description")
                cols = cols[:desc_idx] + cols[desc_idx+1:] + [cols[desc_idx]]
                fake_description = False
            else:
                fake_description = True

            with open(output[0], "w") as out:
                writer = csv.writer(out, delimiter="\t")
                writer.writerow(cols + (["Description"] if fake_description else []))
                for row in project.iter_samples(cols):
                    if fake_description:
                        row = list(row) + ['']
                    writer.writerow(row[1:])

                # TODO: Rename bccol to "BarcodeSequence"
                #       Fake LinkerPrimerSequence col if not exists
                #       Make first/index column be called "#SampleID"