Snakefile "ymp/rules/00_import.rules"
"""
This file contains the rules dealing with FQ file provisioning and preparation.
"""
###
### Create configured directories if requested
###
localrules: mkdir
rule mkdir:
"""
Auto-create directories listed in ymp config.
Use these as input:
>>> input: tmpdir = ancient(ymp.get_config().dir.tmp)
Or as param:
>>> param: tmpdir = "{:ensuredir.tmp:}"
"""
message: "Creating directory '{output}'"
output: directory("{{x,({})}}".format("|".join(list(ymp.get_config().dir))))
shell: "mkdir -p {output}"
###
### SRA access
###
localrules: prefetch
rule prefetch:
"""
Downloads SRA files into NCBI SRA folder (ncbi/public/sra).
"""
# get path with
# vdb-config /repository/user/main/public/root/
# ?
message:
"Pre-Fetching {wildcards.SRR}"
output:
"{:dir.sra:}/{SRR}.sra"
wildcard_constraints:
SRR = r"[EDS]RR[0-9]+",
conda:
"sratools.yml"
shell: """
prefetch {wildcards.SRR}
"""
rule fastq_dump:
"""
Extracts FQ from SRA files
"""
message:
"Extracting FastQ from {wildcards.SRR}"
output:
"{:dir.scratch:}/SRR/{SRR}_1.fastq.gz",
"{:dir.scratch:}/SRR/{SRR}_2.fastq.gz"
wildcard_constraints:
SRR = r"[EDS]RR[0-9]+",
params:
outdir = "{:ensuredir.scratch:}/SRR",
p = lambda wc, threads: int(threads/2+.5),
resources:
mem = "200M",
conda:
"sratools.yml"
threads:
4
# FIXME
# the two cut processes use about 1 cpu each, fastqdump 1/4 and pgzip about 1 each.
# not ideal. not sure why cut needs so much time.
shell: """
fastq-dump {wildcards.SRR} \
--split-files \
--readids \
--dumpbase \
--skip-technical \
--clip \
--read-filter pass \
--stdout | \
paste - - - - - - - - | \
tee >(cut -f 1-4 | tr "\t" "\\n" | pigz -p {params.p} > {output[0]}) | \
cut -f 5-8 | tr "\t" "\\n" | pigz -p {params.p} > {output[1]}
"""
with Stage("") as S:
S.doc("""
Imports raw read files into YMP.
>>> ymp make toy
>>> ymp make mpic
""")
S.add_param(typ="choice", name="project",
value=list(ymp.get_config().projects.keys()), key="")
_ymp_stage_import = S
localrules: symlink_raw_reads
rule symlink_raw_reads:
"""Normalize FQ names by creating symlinks to original files"""
message:
"Creating symlink {output} -> {input}"
input:
# extract path from config file:
"{:raw_reads_source_path(target, 0):}",
"{:raw_reads_source_path(target, 1):}"
output:
"{:this:}/{target}.{:pairnames:}.fq.gz"
run:
for n in range(len(input)):
if not os.path.isabs(input[n]):
input[n] = os.path.join("..", input[n])
os.symlink(input[n], output[n])
localrules: symlink_raw_reads_SE
rule symlink_raw_reads_SE: # ymp: extends symlink_raw_reads
input:
"{:raw_reads_source_path(target, 0):}"
output:
["{:this:}/{target}.{:pairnames[0]:}.fq.gz"]
rule export_qiime_map_file:
message:
"Creating Qiime-Style map file"
output:
"{:this:}/qiime_mapping.tsv"
params:
variables = "{:variables:}",
project = "{:project_name:}"
run:
# Note: Do not use variables inside of "run" that are reassigned
# elsewhere. This is why we pass "S" via params. Otherwise,
# it would be the last assigned value of S, so likely
# some other stage.
import csv
project = ymp.get_config().projects[params.project]
cols = params.variables
if "Description" in cols:
desc_idx = cols.index("Description")
cols = cols[:desc_idx] + cols[desc_idx+1:] + [cols[desc_idx]]
fake_description = False
else:
fake_description = True
with open(output[0], "w") as out:
writer = csv.writer(out, delimiter="\t")
writer.writerow(cols + (["Description"] if fake_description else []))
for row in project.iter_samples(cols):
if fake_description:
row = list(row) + ['']
writer.writerow(row[1:])
# TODO: Rename bccol to "BarcodeSequence"
# Fake LinkerPrimerSequence col if not exists
# Make first/index column be called "#SampleID"