From 30a574a0d2fa91e9b0680d1f03dc3a0b777708fb Mon Sep 17 00:00:00 2001
From: Lucy McNeill <lmcneill@svi.edu.au>
Date: Fri, 16 Apr 2021 16:16:02 +1000
Subject: [PATCH] add project template skeleton

---
 CITATION                   |   1 +
 Dockerfile                 |  19 +++
 LICENSE                    |   4 +
 README.md                  |  79 ++++++++-
 Snakefile                  | 323 +++++++++++++++++++++++++++++++++++++
 _workflowr.yml             |  10 ++
 analysis/_site.yml         |  21 +++
 analysis/about.Rmd         |  39 +++++
 analysis/index.Rmd         | 156 ++++++++++++++++++
 analysis/license.Rmd       |  15 ++
 cluster.json               |  11 ++
 data/README.md             |   6 +
 environment.yml            |  25 +++
 envs/myenv.yaml            |  25 +++
 org/README.md              |   3 +
 org/project_management.org |  34 ++++
 output/README.md           |   3 +
 references/README.md       |   3 +
 resources/README.md        |   3 +
 source/README.md           |   3 +
 20 files changed, 782 insertions(+), 1 deletion(-)
 create mode 100644 CITATION
 create mode 100644 Dockerfile
 create mode 100644 LICENSE
 create mode 100755 Snakefile
 create mode 100644 _workflowr.yml
 create mode 100644 analysis/_site.yml
 create mode 100644 analysis/about.Rmd
 create mode 100644 analysis/index.Rmd
 create mode 100644 analysis/license.Rmd
 create mode 100644 cluster.json
 create mode 100644 data/README.md
 create mode 100644 environment.yml
 create mode 100644 envs/myenv.yaml
 create mode 100644 org/README.md
 create mode 100644 org/project_management.org
 create mode 100644 output/README.md
 create mode 100644 references/README.md
 create mode 100644 resources/README.md
 create mode 100644 source/README.md

diff --git a/CITATION b/CITATION
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/CITATION
@@ -0,0 +1 @@
+
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..4e0bad8
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,19 @@
+FROM nfcore/base
+# NOTE(review): the base image is untagged (resolves to :latest); pin a release tag for reproducible builds.
+LABEL authors="dmccarthy@svi.edu.au" \
+    maintainer="Davis McCarthy <dmccarthy@svi.edu.au>" \
+    description="Docker image containing all requirements for AAAA_2019_Project-Template"
+# Install the compiler toolchain, curl, git and compression headers; the apt lists are removed in the same layer.
+RUN apt-get update && \
+    apt-get -y upgrade && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libbz2-dev \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+# Build the conda env from environment.yml (pin Python there); PATH assumes the env is named aaaa-2021-Project-Template - confirm.
+COPY environment.yml /
+RUN conda env create -f /environment.yml && conda clean -a
+ENV PATH=/opt/conda/envs/aaaa-2021-Project-Template/bin:$PATH
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..bbbda13
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,4 @@
+Copyright (C) 2019, Davis McCarthy
+
+This work is copyrighted under a  Creative Commons Attribution-ShareAlike 4.0
+International license. For details, see https://creativecommons.org/licenses/by-sa/4.0/legalcode.
diff --git a/README.md b/README.md
index a4c240f..c2aeb7a 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,79 @@
-# synapsis
+# Project AAAA_2019_Project-Template
 
+## Project overview
+
+[Write a concise overview of the project]
+
+## Contributors
+
+[List contributors to the project, affiliations, and if appropriate contact details]
+
+## Project setup
+
+### Define project name and identifier
+
+To use this template project for a new project, first copy the directory or clone the repository to a new location.
+
+The convention for project names is that they are identified by a randomly-generated four-letter code (in the format `[A-Z][AEIOU][A-Z][A-Z]`), the year in which the project commenced and a project name (title case with words separated by hyphens). These elements are separated by underscores.
+
+Thus, this template project has a valid identifier ("project ID"): `AAAA_2019_Project-Template`.
+
+R code to generate a unique four-letter project code (setting a random seed based on the date it is run):
+```
+set.seed(as.numeric(Sys.Date()))
+c(sample(LETTERS, 1), sample(c("A","E","I","O","U"), 1), sample(LETTERS, 2))
+```
+
+### Update template for new project
+
+
+
+Specify the way to cite the project in the `CITATION` file. This will likely need updating over the course of the project.
+
+Change the seed for random number generation for the project in `_workflowr.yml` to allow reproducibility of analyses that use "random" numbers. A sensible choice is the date that the project is set up in YYYYMMDD format (i.e. a numeric value).
+
+Check that the `LICENSE` file is appropriate. Unless there is good reason to prefer something different, we strongly prefer open, permissive licenses to make our work as widely accessible and useable as possible.
+
+The `Snakefile` file that defines the Snakemake workflow will need to be rewritten, but the included file in this template provides a useful starting point and some handy patterns to exploit.
+
+The `analysis/about.Rmd`, `analysis/index.Rmd` and `analysis/license.Rmd` files need to be completed, but the existing templates from a previous project provide a useful guide and starting point.
+
+## Project organisation and management
+
+### File naming conventions
+
+Files should have names that follow this pattern: `<project code>_<date file created>_<descriptive name>.<file extension>`. The four-letter project code should be used (e.g. AAAA). The YYYY-MM-DD format should be used (e.g. 2019-01-01 for January 1 2019). Hyphens should be used to separate words in the descriptive name (e.g. exploratory-data-analysis). 
+
+Thus a valid file name is `AAAA_2019-01-01_exploratory-data-analysis.Rmd`.
+
+Applying these conventions ensures that user-created files are easily identifiable and uniquely named.
+
+### Correspondence about the project
+
+All emails sent regarding the project should have the four-letter project code at the start of the email subject (e.g. `AAAA - <topic of email>`). This ensures that email correspondence for the project is easily searchable in overstuffed email archives and inboxes.
+
+Particularly important emails should be exported to PDF and saved in a subfolder (e.g. `correspondence`) of the `org` folder.
+
+### To-dos and notes
+
+The `org` folder is intended to house files used for organising and managing the project. These might include markdown (`.md`) files for notes, and `.org` mode files (another plaintext document format compatible with Emacs org-mode and org-mode emulators in other modern text editors) for to-do lists and outlines.
+
+There is an included org-mode file `org/project_management.org` that can act as a template for organising the project with a roadmap and to-do lists.
+
+## Reproducibility and version control
+
+We aim to make all of our projects and analyses completely open and reproducible. The [workflowr][] package makes this aim easier to achieve by providing a set of tools and conventions for reproducible analysis workflows in R and the simultaneous building of a website that presents the analyses, with source code, in a readable way. 
+
+The system integrates seamlessly with the git version control system.
+
+As many project files as possible should be under version control. All user-created text files (e.g. `.md`, `.Rmd`, etc) and code (`.R`, `.py`, etc. files) should be under version control. Files that are large in size should not be under version control (too many large files in the repository makes git become unwieldy), and, in general, there is no need for files produced by code or analysis files to be versioned as running the workflow will produce them.
+
+Docker containers encapsulating software environments enable reproducibility by allowing the same software and environments to be easily used by different people across different platforms. As such, we use them extensively.
+
+Workflow management software makes a very big difference when trying to run complicated computational workflows and making them portable across local, cluster, and other HPC computing environments.
+
+## Acknowledgements
+
+This project is a [workflowr][] project. Making use of the workflowr package for reproducible analyses dictates certain structures for the project file.
+
+[workflowr]: https://github.com/jdblischak/workflowr
diff --git a/Snakefile b/Snakefile
new file mode 100755
index 0000000..511d22c
--- /dev/null
+++ b/Snakefile
@@ -0,0 +1,323 @@
+
+"""
+Snakefile for 
+
+Author: Davis McCarthy
+Affiliation: St Vincent's Institute of Medical Research and the University of Melbourne
+
+Run: snakemake -s Snakefile_canopy --jobs 1000 --latency-wait 30 --cluster-config cluster.json --cluster 'bsub -J {cluster.name} -q {cluster.queue} -n {cluster.n} -R "rusage[mem={cluster.memory}]" -M {cluster.memory}  -o {cluster.output} -e {cluster.error}' --keep-going --rerun-incomplete
+
+Davis McCarthy, 02 January 2019
+"""
+
+import glob
+import os
+from subprocess import run
+import subprocess
+import pandas as pd
+import re
+import h5py
+
+shell.prefix("set -euo pipefail;")
+
+donors = ['bima', 'bubh', 'ceik', 'ciwj', 'cuhk', 'deyz', 'diku', 'eipl', 'eofe', 'euts', \
+        'fawm', 'feec', 'fiaj', 'fikt', 'garx', 'gesg', 'gifk', 'hehd', 'heja', 'hipn', 'ieki', \
+        'jogf', 'joxm', 'kajh', 'kuco', 'laey', 'lexy', 'melw', 'miaj', 'naju', 'nusw', 'oaaz', \
+        'oaqd', 'oicx', 'oilg', 'pamv', 'pelm', 'pipw', 'puie', 'qayj', 'qolg', 'qonc', 'rozh', \
+        'rutc', 'sebz', 'sehl', 'sohd', 'tixi', 'toss', 'ualf', 'vabj', 'vass', 'vils', 'vuna', \
+        'wahn', 'wetu', 'wigw', 'wopl', 'wuye', 'xugn', 'xuja', 'zihe', 'zoxy']
+## too few variants for clonal analysis:
+singlecell_donors_all = ['bima', 'bubh', 'ceik', 'ciwj', 'cuhk', 'deyz', 'diku',\
+                         'eipl', 'eofe', 'euts', 'fawm', 'feec', 'fiaj', 'fikt',\
+                          'garx', 'gesg', 'gifk', 'hehd', 'heja', 'hipn', 'ieki',\
+                          'joxm', 'kajh', 'kuco', 'laey', 'lexy', 'melw',\
+                          'miaj', 'naju', 'nusw', 'oaaz', 'oaqd', 'oilg',\
+                          'pamv', 'pelm', 'pipw', 'puie', 'qayj', 'qolg', 'qonc',\
+                          'rozh', 'rutc', 'sebz', 'sehl', 'sohd', 'toss', 'ualf',\
+                          'vabj', 'vass', 'vils', 'vuna', 'wahn', 'wetu', 'wigw',\
+                          'wopl', 'wuye', 'xugn', 'xuja', 'zihe', 'zoxy'] # 60 donors
+## lenient variant filtering
+## donors with <10 variants with coverage in at least one cell:
+## bima, bubh, ceik, cuhk, deyz, diku, dons, eika, fiaj, gifk, hehd, jogf, kajh, lise, pamv, pelm, rutc, sebz, tolg, toss, tuju, vabj, wigw, wopl, wuye, xuja, zihe
+## not enough QC-passing cells (<30): ciwj, eipl, eofe, miaj, oaqd, 
+donors_lenient_all = ['euts', 'fawm', 'feec', 'fikt', \
+    'garx', 'gesg', 'heja', 'hipn', 'ieki', 'joxm', 'kuco', 'laey', 'lexy', 'melw', \
+    'naju', 'nusw', 'oaaz', 'oilg', 'pipw', 'puie', 'qayj', 'qolg', 'qonc', 'rozh', \
+    'sehl', 'sohd', 'ualf', 'vass', 'vils', 'vuna', 'wahn', 'wetu', 'xugn', 'zoxy'] ## 34 donors  
+## Canopy will not fit (variant clustering fails): melw, sohd
+donors_lenient_cell_cov = ['euts', 'fawm', 'feec', 'fikt', 'garx', 'gesg', \
+    'heja', 'hipn', 'ieki', 'joxm', 'kuco', 'laey', 'lexy', 'naju', 'nusw', \
+    'oaaz', 'oilg', 'pipw', 'puie', 'qayj', 'qolg', 'qonc', 'rozh', 'sehl', \
+    'ualf', 'vass', 'vils', 'vuna', 'wahn', 'wetu', 'xugn', 'zoxy'] ## 32 donors
+## strict variant filtering
+## donors with <10 variants with coverage in at least one cell:
+## bima, bubh, ceik, ciwj, cuhk, deyz, diku, dons, eika, fiaj, gifk, hehd, jogf, kajh, lexy, lise, pamv, pelm, rutc, sebz, tolg, toss, tuju, vabj, vils, wigw, wopl, wuye, xuja, zihe
+## not enough QC-passing cells (<30): eipl, eofe, melw, miaj, oaqd
+donors_strict_all = ['euts', 'fawm', 'feec', 'fikt', 'garx', 'gesg', \
+    'heja', 'hipn', 'ieki', 'joxm', 'kuco', 'laey', 'naju', 'nusw', \
+    'oaaz', 'oilg', 'pipw', 'puie', 'qayj', 'qolg', 'qonc', 'rozh', 'sehl', \
+    'sohd', 'ualf', 'vass', 'vuna', 'wahn', 'wetu', 'xugn', 'zoxy'] # 31 donors
+## Canopy will not fit (variant clustering fails): kuco, sohd
+donors_strict_cell_cov = ['euts', 'fawm', 'feec', 'fikt', 'garx', 'gesg', \
+    'heja', 'hipn', 'ieki', 'joxm', 'laey', 'naju', 'nusw', \
+    'oaaz', 'oilg', 'pipw', 'puie', 'qayj', 'qolg', 'qonc', 'rozh', 'sehl', \
+    'ualf', 'vass', 'vuna', 'wahn', 'wetu', 'xugn', 'zoxy'] # 29 donors
+
+sce_list = {}
+sce_list['filt_lenient'] = {}
+sce_list['filt_lenient']['all_filt_sites'] = expand(\
+    'data/sces/sce_{donor}_with_clone_assignments.filt_lenient.all_filt_sites.rds',\
+    donor = donors_lenient_all)
+sce_list['filt_lenient']['cell_coverage_sites'] = expand(\
+    'data/sces/sce_{donor}_with_clone_assignments.filt_lenient.cell_coverage_sites.rds',\
+    donor = donors_lenient_cell_cov)
+sce_list['filt_strict'] = {}
+sce_list['filt_strict']['all_filt_sites'] = expand(\
+    'data/sces/sce_{donor}_with_clone_assignments.filt_strict.all_filt_sites.rds',\
+    donor = donors_strict_all)
+sce_list['filt_strict']['cell_coverage_sites'] = expand(\
+    'data/sces/sce_{donor}_with_clone_assignments.filt_strict.cell_coverage_sites.rds',\
+    donor = donors_strict_cell_cov)
+sces_flat = [  # every SCE path in sce_list as one flat list (lenient before strict, all_filt_sites before cell_coverage_sites)
+    filename
+    for strictness in ('filt_lenient', 'filt_strict')
+    for sites in ('all_filt_sites', 'cell_coverage_sites')
+    for filename in sce_list[strictness][sites]
+]
+
+rule all:  # first rule in the file, so it is the default target; its inputs are the files requested from the workflow
+    input:
+        expand('data/exome-point-mutations/high-vs-low-exomes.v62.ft.filt_{strictness}-{donor}.txt.gz',\
+            strictness = ['lenient', 'strict'], donor = singlecell_donors_all),
+        expand('data/raw/mpileup/{donor}.mpileup.vcf{suffix}', \
+            donor = singlecell_donors_all, suffix = ['.gz', '.gz.csi']),
+        expand('data/sces/sce_{donor}_with_clone_assignments.{strictness}.{sites}.rds',\
+            donor = singlecell_assign_donors, strictness = ['filt_strict', 'filt_lenient'],  # NOTE(review): singlecell_assign_donors is not defined in this Snakefile - define it or use one of the donor lists above
+            sites = ['all_filt_sites', 'cell_coverage_sites']),
+        expand('reports/de_pathway/de_pathway.{cells}.{strictness}.{sites}.html', \
+             cells = ['unst_cells'], strictness = ['filt_strict', 'filt_lenient'],\
+             sites = ['all_filt_sites', 'cell_coverage_sites']), # 'cell_coverage_sites'
+        expand('reports/de_pathway/de_pathway.{cells}.cellcycle_analyses.{strictness}.{sites}.html', \
+             cells = ['unst_cells'], strictness = ['filt_strict', 'filt_lenient'],\
+             sites = ['all_filt_sites', 'cell_coverage_sites']), # 'cell_coverage_sites'
+        expand('reports/de_pathway/de_pathway.{cells}.permutations.{strictness}.{sites}.html', \
+             cells = ['unst_cells'], strictness = ['filt_strict', 'filt_lenient'],\
+             sites = ['all_filt_sites', 'cell_coverage_sites']),  
+        expand('data/exome-point-mutations/high-vs-low-exomes.v62.ft.alldonors-{strictness}.all_filt_sites.ped', \
+            strictness = ['filt_strict', 'filt_lenient']),
+        expand('data/exome-point-mutations/high-vs-low-exomes.v62.ft.alldonors-{strictness}.all_filt_sites.vcf', \
+            strictness = ['filt_strict', 'filt_lenient']),
+        expand('data/simulations/{donor}.simulate.rds', \
+            donor = donors_lenient_cell_cov),
+        expand('data/variance_components/donorVar/{donor}.var_part.var1.csv', \
+            donor = donors_lenient_cell_cov)
+
+
+rule run_varpart_per_donor:
+    input:
+        sce=lambda wildcards: sce_list['filt_lenient']['cell_coverage_sites']
+    output:
+        'data/variance_components/donorVar/{donor}.var_part.var1.csv'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/var_part_donor.R {wildcards.donor}'
+
+
+rule run_simulation_per_donor:
+    input:
+        card='data/cell_assignment/cardelino_results.{donor}.filt_lenient.cell_coverage_sites.rds'
+    output:
+        real_data='data/simulations/{donor}.filt_lenient.cell_coverage_sites.mult.rds',
+        simu_data='data/simulations/{donor}.simulate.rds'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/simulation_per_donor.R {wildcards.donor}'
+
+
+rule run_de_pathway_analysis_unst_cells_permutation:  # renders the permutation DE-pathway report for one strictness/sites combination
+    input:
+        sce=lambda wildcards: sce_list[wildcards.strictness][wildcards.sites]
+    output:
+        html='reports/de_pathway/de_pathway.unst_cells.permutations.{strictness}.{sites}.html',
+        unst_rds='data/de_analysis_FTv62/permutations/{strictness}.{sites}.de_results_unstimulated_cells.rds'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/compile_report_de_pathways.R '
+        '-c {wildcards.strictness}.{wildcards.sites} '
+        '-o {output.html} '
+        '--template src/Rmd/DE_pathways_FTv62_callset_clones_pairwise_vs_base.unst_cells.permutations.Rmd '
+        '--title "DE Pathway permutation analysis using unstimulated cells: {wildcards.strictness} {wildcards.sites}" '
+        '--to_working_dir ../../ '
+
+
+rule run_de_pathway_analysis_unst_cells_cellcycle:
+    input:
+        sce=lambda wildcards: sce_list[wildcards.strictness][wildcards.sites]
+    output:
+        html='reports/de_pathway/de_pathway.unst_cells.cellcycle_analyses.{strictness}.{sites}.html',
+        unst_rds='data/de_analysis_FTv62/cellcycle_analyses/{strictness}.{sites}.de_results_unstimulated_cells.cc.rds'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/compile_report_de_pathways.R '
+        '-c {wildcards.strictness}.{wildcards.sites} '
+        '-o {output.html} '
+        '--template src/Rmd/DE_pathways_FTv62_callset_clones_pairwise_vs_base.cell_cycle.unst_cells.Rmd '
+        '--title "DE Pathway Analysis using unstimulated cells accounting for cell cycle : {wildcards.strictness} {wildcards.sites}" '
+        '--to_working_dir ../../ '
+
+
+rule run_de_pathway_analysis_unst_cells:
+    input:
+        sce=lambda wildcards: sce_list[wildcards.strictness][wildcards.sites]
+    output:
+        html='reports/de_pathway/de_pathway.unst_cells.{strictness}.{sites}.html',
+        unst_rds='data/de_analysis_FTv62/{strictness}.{sites}.de_results_unstimulated_cells.rds'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/compile_report_de_pathways.R '
+        '-c {wildcards.strictness}.{wildcards.sites} '
+        '-o {output.html} '
+        '--template src/Rmd/DE_pathways_FTv62_callset_clones_pairwise_vs_base.unst_cells.Rmd '
+        '--title "DE Pathway Analysis using unstimulated cells: {wildcards.strictness} {wildcards.sites}" '
+        '--to_working_dir ../../ '
+
+
+rule run_cell_assignment:
+    input:
+        can='data/canopy/canopy_results.{donor}.{strictness}.{sites}.rds',
+        sce='data/sces/sce_{donor}_qc.rds',
+        vcf='data/raw/mpileup/{donor}.mpileup.vcf.gz',
+        csi='data/raw/mpileup/{donor}.mpileup.vcf.gz.csi'
+    output:
+        html = 'reports/cell_assignment/cell_assignment.{donor}.{strictness}.{sites}.html',
+        sce = 'data/sces/sce_{donor}_with_clone_assignments.{strictness}.{sites}.rds',
+        card = 'data/cell_assignment/cardelino_results.{donor}.{strictness}.{sites}.rds'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/compile_report_cell_assign.R '
+        '-i {input.sce} --vcf_file {input.vcf} --tree_file {input.can} '
+        '-o {output.html} --results_sce {output.sce} --results_card {output.card} '
+        '--template src/Rmd/cell_assignment_template.Rmd '
+        '--title "Assigning single cells to clones: {wildcards.donor}" '
+        '--donor {wildcards.donor} --to_working_dir ../../ '
+
+
+rule run_canopy_donor_specific_coverage:  # renders the Canopy report from the per-donor filtered variant table
+    input:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.{strictness}-{donor}.txt.gz'  # lower-case data/ to match the filter_somatic_variants_per_donor_* outputs
+    output:
+        html = 'reports/canopy/canopy.analysis.{donor}.{strictness}.cell_coverage_sites.html',
+        rds = 'data/canopy/canopy_results.{donor}.{strictness}.cell_coverage_sites.rds'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript '
+        'src/R/compile_report.R -i {input} -o {output.html} '
+        '--results_out {output.rds} '
+        '--template src/Rmd/canopy_analysis_template.Rmd '
+        '--title "Canopy analysis: {wildcards.donor}" '
+        '--donor {wildcards.donor} --to_working_dir ../../ '
+
+
+rule run_canopy:  # renders the Canopy report for one donor from the all-donors filtered variant table
+    input:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.{strictness}-alldonors.txt.gz'  # lower-case data/ to match the filter_somatic_variants_{strict,lenient} outputs
+    output:
+        html = 'reports/canopy/canopy.analysis.{donor}.{strictness}.all_filt_sites.html',
+        rds = 'data/canopy/canopy_results.{donor}.{strictness}.all_filt_sites.rds'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript '
+        'src/R/compile_report.R -i {input} -o {output.html} '
+        '--results_out {output.rds} '
+        '--template src/Rmd/canopy_analysis_template.Rmd '
+        '--title "Canopy analysis: {wildcards.donor}" '
+        '--donor {wildcards.donor} --to_working_dir ../../ '
+
+
+rule filter_somatic_variants_per_donor_strict:
+    input:
+        flat='data/exome-point-mutations/high-vs-low-exomes.v62.ft.filt_strict-alldonors.txt.gz',
+        vcf='data/raw/mpileup/{donor}.mpileup.vcf.gz',
+        csi='data/raw/mpileup/{donor}.mpileup.vcf.gz.csi'
+    output:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.filt_strict-{donor}.txt.gz'
+    conda:
+        "envs/myenv.yaml"
+    shell:
+        'Rscript src/R/filter_variants.R -i {input.flat} -o {output} '
+        '--donor_cell_vcf {input.vcf} --max_fdr 0.2 '
+        '--min_prop_covered_cells 0.005 --donor_name {wildcards.donor}'
+
+
+rule filter_somatic_variants_per_donor_lenient:
+    input:
+        flat='data/exome-point-mutations/high-vs-low-exomes.v62.ft.filt_lenient-alldonors.txt.gz',
+        vcf='data/raw/mpileup/{donor}.mpileup.vcf.gz',
+        csi='data/raw/mpileup/{donor}.mpileup.vcf.gz.csi'
+    output:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.filt_lenient-{donor}.txt.gz'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/filter_variants.R -i {input.flat} -o {output} '
+        '--donor_cell_vcf {input.vcf} --max_fdr 0.2 '
+        '--min_prop_covered_cells 0.005 --donor_name {wildcards.donor}'
+
+
+rule filter_somatic_variants_strict:
+    input:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.txt.gz'
+    output:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.filt_strict-alldonors.txt.gz'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/filter_variants.R -i {input} -o {output} '
+        '--max_fdr 0.05 --min_vaf_fibro 0.03 --max_vaf_fibro 0.45 '
+        '--min_nalt_fibro 2.5 --max_vaf_ips 0.7 --combo_max_vaf_fibro 0.35 '
+        '--combo_max_vaf_ips 0.3'
+
+
+rule filter_somatic_variants_lenient:
+    input:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.txt.gz'
+    output:
+        'data/exome-point-mutations/high-vs-low-exomes.v62.ft.filt_lenient-alldonors.txt.gz'
+    conda:
+        "envs/myenv.yaml"
+    singularity:
+        "docker://davismcc/r-singlecell-img"
+    shell:
+        'Rscript src/R/filter_variants.R -i {input} -o {output} '
+        '--max_fdr 0.1 --min_vaf_fibro 0.01 --max_vaf_fibro 0.45 '
+        '--min_nalt_fibro 1.5 --max_vaf_ips 0.8 --combo_max_vaf_fibro 0.45 '
+        '--combo_max_vaf_ips 0.45'
+
diff --git a/_workflowr.yml b/_workflowr.yml
new file mode 100644
index 0000000..2e3ccb9
--- /dev/null
+++ b/_workflowr.yml
@@ -0,0 +1,10 @@
+# workflowr options
+# Version 1.1.1
+
+# The seed to use for random number generation. See ?set.seed for details.
+seed: 20190102
+# The working directory to build the R Markdown files. The path is relative to
+# _workflowr.yml. See ?rmarkdown::render for details.
+knit_root_dir: "."
+# Session information function
+sessioninfo: "devtools::session_info()"
diff --git a/analysis/_site.yml b/analysis/_site.yml
new file mode 100644
index 0000000..2b884d0
--- /dev/null
+++ b/analysis/_site.yml
@@ -0,0 +1,21 @@
+name: "aaaa-2021-Project-Template"
+output_dir: "../public"
+navbar:
+  title: "aaaa-2021-Project-Template"
+  left:
+    - text: "Home"
+      href: index.html
+    - text: "About"
+      href: about.html
+    - text: "License"
+      href: license.html
+  right:
+    - icon: fa-gitlab
+      href: https://gitlab.svi.edu.au/dna-repair-and-recombination-lab/aaaa-2021-project-template
+output:
+  workflowr::wflow_html:
+    toc: true
+    toc_float: true
+    theme: journal
+    highlight: pygments
+    code_folding: hide
diff --git a/analysis/about.Rmd b/analysis/about.Rmd
new file mode 100644
index 0000000..9180045
--- /dev/null
+++ b/analysis/about.Rmd
@@ -0,0 +1,39 @@
+---
+title: "About"
+output:
+  workflowr::wflow_html:
+    toc: false
+---
+
+## *Cardelino* : Integrating whole exomes and single-cell transcriptomes to reveal phenotypic impact of somatic variants
+
+**Key findings:**
+
+* A new approach for integrating DNA-seq and single-cell RNA-seq data to reconstruct clonal substructure and single-cell transcriptomes.
+* A new computational method to map single-cell RNA-seq profiles to clones.
+* Evidence for non-neutral evolution of clonal populations in human fibroblasts.
+* Proliferation and cell cycle pathways are commonly distorted in mutated clonal populations, with implications for cancer and ageing. 
+
+
+**Abstract**
+
+Decoding the clonal substructures of somatic tissues sheds light on cell growth, development and differentiation in health, ageing and disease. DNA-sequencing, either using bulk or using single-cell assays, has enabled the reconstruction of clonal trees from somatic variants. However, approaches to characterize phenotypic and functional variations between clones are not established. 
+
+Here we present cardelino (https://github.com/PMBio/cardelino), a computational method to assign single-cell transcriptome profiles to somatic clones using variant information contained in single-cell RNA-seq (scRNA-seq) data. After validating our model using simulations, we apply cardelino to matched scRNA-seq and exome sequencing data from 32 human dermal fibroblast lines.
+
+We identify hundreds of differentially expressed genes between cells assigned to different clones. These genes were frequently enriched for the cell cycle and pathways related to cell proliferation, and our data point to clone gene expression phenotypes that support previous work showing non-neutral somatic evolution in nominally healthy human skin cells.
+
+
+## Authors
+
+The full author list is as follows:
+
+Davis J. McCarthy<sup>1,4,\*</sup>, Raghd Rostom<sup>1,2,\*</sup>, Yuanhua Huang<sup>1,\*</sup>, Daniel J. Kunz<sup>2,5,6</sup>, Petr Danecek<sup>2</sup>, Marc Jan Bonder<sup>1</sup>, Tzachi Hagai<sup>1,2</sup>, HipSci Consortium, Wenyi Wang<sup>8</sup>, Daniel J. Gaffney<sup>2</sup>, Benjamin D. Simons<sup>5,6,7</sup>, Oliver Stegle<sup>1,3,9,#</sup>, Sarah A. Teichmann<sup>1,2,5,#</sup>
+
+<sup>1</sup>European Molecular Biology Laboratory, European Bioinformatics Institute, Wellcome Genome Campus, CB10 1SD
+Hinxton, Cambridge, UK; <sup>2</sup>Wellcome Sanger Institute, Wellcome Genome Campus, Hinxton, CB10 1SA, UK; <sup>3</sup>European Molecular Biology Laboratory, Genome Biology Unit, 69117 Heidelberg, Germany; <sup>4</sup>St Vincent’s Institute of Medical Research, Fitzroy, Victoria 3065, Australia. <sup>5</sup>Cavendish Laboratory, Department of Physics, JJ Thomson Avenue, Cambridge, CB3 0HE, UK. <sup>6</sup>The Wellcome Trust/Cancer Research UK Gurdon Institute, University of Cambridge, Cambridge, CB2 1QN, UK. <sup>7</sup>The Wellcome Trust/Medical Research Council Stem Cell Institute, University of Cambridge, Cambridge, UK. <sup>8</sup>Department of Bioinformatics and Computational Biology, The University of Texas MD Anderson Cancer Center, Houston, Texas 77030, USA. <sup>9</sup>Division of Computational Genomics and Systems Genetics, German Cancer Research Center (DKFZ), 69120, Heidelberg, Germany.
+
+<sup>*</sup> These authors contributed equally to this work.
+
+<sup>#</sup> Corresponding authors. 
+
diff --git a/analysis/index.Rmd b/analysis/index.Rmd
new file mode 100644
index 0000000..7c61d89
--- /dev/null
+++ b/analysis/index.Rmd
@@ -0,0 +1,156 @@
+---
+title: "single-cell expression QTL Workflow project"
+author: "Davis McCarthy"
+site: workflowr::wflow_site
+output:
+  workflowr::wflow_html:
+    toc: false
+---
+
+## Project overview
+
+This project develops a pipeline for eQTL analysis using single cell sequencing data from human dermal fibroblast cell populations and human induced pluripotent stem cells at different timepoints during differentiation. 
+
+**Key findings:**
+
+* A novel approach for integrating DNA-seq and single-cell RNA-seq data to 
+reconstruct clonal substructure and single-cell transcriptomes.
+* A new computational method, [cardelino](https://github.com/PMBio/cardelino), to map 
+single-cell RNA-seq profiles to clones.
+* Evidence for non-neutral evolution of clonal populations in human fibroblasts.
+* Proliferation and cell cycle pathways are commonly distorted in mutated clonal
+populations, with implications for cancer and ageing. 
+
+For a richer overview, see the [About](about.html) page.
+
+
+## Data pre-processing
+
+The data pre-processing for this project from the raw data described above is 
+complicated and computationally expensive, so this repository does not reproduce
+the data pre-processing in an automated way. However, we provide the source code
+for the [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow for 
+data pre-processing in this repository. Docker images providing the computing 
+environment and software used are publicly available, split into an image for 
+command line [bioinformatics tools](https://hub.docker.com/r/davismcc/fibroblast-clonality/)
+and an [R installation](https://hub.docker.com/r/davismcc/r-singlecell-img/) with 
+necessary packages installed. 
+
+If you would like to pre-process the data from raw reads to results as we have, 
+please consult our description of [how to run](data_preprocessing.html) the 
+workflow. 
+
+## Analyses
+
+Here we present the reproducible results of our analyses. They were 
+generated by rendering the 
+[R Markdown documents](https://github.com/davismcc/fibroblast-clonality/tree/master/analysis) 
+into webpages available at the links below.
+
+The results presented in the paper were produced with these analyses.
+
+1. [Simulation results.](simulations.html)
+
+1. [Overview of lines.](overview_lines.html)
+
+1. [Selection models.](selection_models.html)
+
+1. [Analysis of clonal prevalences.](clone_prevalences.html)
+
+1. [Analysis for the example cell line *joxm*.](analysis_for_joxm.html)
+
+1. [Variance components analysis.](variance_components.html)
+
+1. [Differential expression analysis.](differential_expression.html)
+
+1. [Analysis of effects of somatic variants on cis gene expression.](mutated_genes.html)
+
+
+## Data availability
+
+This is a complicated project, and reproducing all of the results presented, especially from raw data, is highly non-trivial. Nevertheless, we have made all data available so that everything is entirely reproducible.
+
+Single-cell RNA-seq data have been deposited in the 
+[ArrayExpress](https://www.ebi.ac.uk/arrayexpress) database at EMBL-EBI under accession 
+number [E-MTAB-7167](https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-7167).
+Whole-exome sequencing data are available through the 
+[HipSci portal](http://www.hipsci.org). Processed data and large results files are 
+available from [Zenodo](http://doi.org/10.5281/zenodo.1403510) with DOI 10.5281/zenodo.1403510. 
+
+To set up the project to reproduce our analyses, first clone the [source code repository](https://github.com/davismcc/fibroblast-clonality) from GitHub. Next, download all of the reference, metadata and results files and add them to the (cloned) project folder with the following structure:
+
+```
+.
+├── data
+│   ├── canopy
+│   │   ├── canopy_results.*.rds
+│   ├── cell_assignment
+│   │   ├── cardelino_results.*.rds
+│   ├── de_analysis_FTv62
+│   │   ├── cellcycle_analyses
+│   │   │   ├── filt_lenient.all_filt_sites.de_results_unstimulated_cells.cc.rds
+│   │   │   ├── filt_lenient.cell_coverage_sites.de_results_unstimulated_cells.cc.rds
+│   │   │   ├── filt_strict.all_filt_sites.de_results_unstimulated_cells.cc.rds
+│   │   │   └── filt_strict.cell_coverage_sites.de_results_unstimulated_cells.cc.rds
+│   │   ├── filt_lenient.all_filt_sites.de_results_unstimulated_cells.rds
+│   │   ├── filt_lenient.cell_coverage_sites.de_results_unstimulated_cells.rds
+│   │   ├── filt_strict.all_filt_sites.de_results_unstimulated_cells.rds
+│   │   └── filt_strict.cell_coverage_sites.de_results_unstimulated_cells.rds
+│   ├── donor_info_070818.txt
+│   ├── donor_info_core.csv
+│   ├── donor_neutrality.tsv
+│   ├── exome-point-mutations
+│   │   ├── high-vs-low-exomes.v62.ft.alldonors-filt_lenient.all_filt_sites.vep_most_severe_csq.txt
+│   │   └── high-vs-low-exomes.v62.ft.filt_lenient-alldonors.txt.gz
+│   ├── human_H_v5p2.rdata
+│   ├── human_c2_v5p2.rdata
+│   ├── human_c6_v5p2.rdata
+│   ├── neg-bin-rsquared-petr.csv
+│   ├── neutralitytestr-petr.tsv
+│   ├── sces
+│   │   ├── sce_*.rds
+│   ├── selection
+│   │   ├── neg-bin-params-fit.csv
+│   │   ├── neg-bin-rsquared-fit.csv
+│   ├── simulations
+│   │   ├── *.filt_lenient.cell_coverage_sites.mult.rds
+│   │   ├── *.simulate.rds
+│   └── variance_components
+│       ├── covar_all.csv
+│       ├── donorVar
+│       │   ├── *.var_part.var1.csv
+│       ├── fit_all_gene_highVar.csv
+│       ├── fit_per_gene_highVar.csv
+│       ├── gene_info_all.csv
+│       └── logcnt_all.csv
+├── metadata
+│   ├── cell_metadata.csv
+│   └── data_processing_metadata.tsv
+├── references
+│   ├── 1000G_phase1.indels.hg19.sites.vcf.gz
+│   ├── GRCh37.p13.genome.ERCC92.fa
+│   ├── Homo_sapiens.GRCh37.rel75.cdna.all.ERCC92.fa.gz
+│   ├── Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz
+│   ├── dbsnp_138.hg19.biallelicSNPs.HumanCoreExome12.Top1000ExpressedIpsGenes.Maf0.01.HWE0.0001.HipSci.vcf.gz
+│   ├── dbsnp_138.hg19.vcf.gz
+│   ├── gencode.v19.annotation_ERCC.gtf
+│   ├── hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.allchr.fibro_samples_v2_filt_vars_sorted_oa.vcf.gz
+│   ├── hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.allchr.fibro_samples_v2_filt_vars_sorted_oa.vcf.gz.csi
+│   └── knownIndels.intervals
+```
+
+For simplicity, we ignore all the directories and files present in the source code repository (that you should have cloned) to focus just on where you should _add_ the files downloaded from Zenodo. Yes, it's still complicated, but such is life.
+
+There is a large number of `canopy_results.*.rds` files: these should be stored in the `data/canopy` directory. Similarly, all of the `cardelino_results.*.rds` files should be stored in `data/cell_assignment`. All of the SingleCellExperiment object files (`sce_*.rds`) should be stored in `data/sces`. Simulation results files (`*.mult.rds`; `*.simulate.rds`) should be stored in `data/simulations`. Variance components results should be stored in `data/variance_components` as shown above.
+
+Differential expression results belong in `data/de_analysis_FTv62`.
+
+Metadata files belong in `metadata`. Reference files belong in `references`.
+
+With the data downloaded and organised as above, you will be able to reproduce the analyses presented in the RMarkdown files linked to above and, if desired, even run the whole analysis pipeline from raw reads to results following [these instructions](data_preprocessing.html).
+
+
+-------
+
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
+
diff --git a/analysis/license.Rmd b/analysis/license.Rmd
new file mode 100644
index 0000000..16ab8ba
--- /dev/null
+++ b/analysis/license.Rmd
@@ -0,0 +1,15 @@
+---
+title: "License"
+output:
+  workflowr::wflow_html:
+    toc: false
+---
+
+All source code, software and outputs in this repository are made available 
+under the terms of the [Creative Commons Attribution-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-sa/4.0/legalcode) 
+license.
+
+Input data are available.
+
+To cite this work, please use...
+
diff --git a/cluster.json b/cluster.json
new file mode 100644
index 0000000..13a79f3
--- /dev/null
+++ b/cluster.json
@@ -0,0 +1,11 @@
+{
+    "__default__" : 
+    {
+        "memory" : "8000",
+        "n" : 1,
+        "queue" : "research",
+        "name" : "EUUI_2019_sceQTL-Workflow.{rule}.{wildcards}",
+        "output" : "logs/{rule}.out",
+        "error" : "logs/{rule}.err"
+    }
+}
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..2414f4b
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,6 @@
+# Data
+
+Save raw data files here.
+
+Note: raw data files are saved in the scratch directory to save space.
+
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..4ec272a
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,25 @@
+name: aaaa-2021-Project-Template
+channels:
+  - defaults
+  - bioconda
+  - conda-forge
+dependencies:
+  - conda-forge::openjdk=8.0.144  # Needed for FastQC docker - see bioconda/bioconda-recipes#5026
+  - fastqc=0.11.7
+  - multiqc=1.5
+  - picard=2.18.4
+  - bcftools=1.8
+  - vcftools=0.1.16
+  - perl-vcftools-vcf=0.1.16
+  - salmon=0.8.2
+  - star=2.6.0b
+  - bedops=2.4.30
+  - cutadapt=1.15
+  - trim-galore=0.4.5
+  - subread=1.6.0
+  - samtools=1.8
+  - tabix=0.2.5
+  - hisat2=2.1.0
+  - rseqc=2.6.4
+  - preseq=2.0.2
+  - gatk=3.8
diff --git a/envs/myenv.yaml b/envs/myenv.yaml
new file mode 100644
index 0000000..490f506
--- /dev/null
+++ b/envs/myenv.yaml
@@ -0,0 +1,25 @@
+name: AAAA_2019_Project-Template
+channels:
+  - defaults
+  - bioconda
+  - conda-forge
+dependencies:
+  - conda-forge::openjdk=8.0.144  # Needed for FastQC docker - see bioconda/bioconda-recipes#5026
+  - fastqc=0.11.7
+  - multiqc=1.5
+  - picard=2.18.4
+  - bcftools=1.8
+  - vcftools=0.1.16
+  - perl-vcftools-vcf=0.1.16
+  - salmon=0.8.2
+  - star=2.6.0b
+  - bedops=2.4.30
+  - cutadapt=1.15
+  - trim-galore=0.4.5
+  - subread=1.6.0
+  - samtools=1.8
+  - tabix=0.2.5
+  - hisat2=2.1.0
+  - rseqc=2.6.4
+  - preseq=2.0.2
+  - gatk=3.8
diff --git a/org/README.md b/org/README.md
new file mode 100644
index 0000000..a7990ee
--- /dev/null
+++ b/org/README.md
@@ -0,0 +1,3 @@
+# Organisation
+
+Save project management and organisational files here.
diff --git a/org/project_management.org b/org/project_management.org
new file mode 100644
index 0000000..18c0413
--- /dev/null
+++ b/org/project_management.org
@@ -0,0 +1,34 @@
+#+TITLE: AAAA_2019_Project-Template
+
+* TODO Organise roadmap for project [2019-02-02 Sat]
+
+* Kanban 
+
+| Backlog         | Waiting On | Planned | Doing | Done |
+|-----------------+------------+---------+-------+------|
+| [[Update template]] |            |         |       |      |
+| [[Reproducibility]] |            |         |       |      |
+|                 |            |         |       |      |
+  
+
+* [#C] Update template <<Update template>> [0%]
+
+** TODO Specify the way to cite the project in the `CITATION` file. 
+ This will likely need updating over the course of the project.
+- [ ] Update in `analysis/license.Rmd` too
+
+** TODO Check that the `LICENSE` file is appropriate.
+ Unless there is good reason to prefer something different, we strongly prefer
+ open, permissive licenses to make our work as widely accessible and useable as
+ possible.
+- [ ] Update in `analysis/license.Rmd` too
+
+
+* [#B] Make project reproducible <<Reproducibility>>
+
+** TODO Rewrite the `Snakefile` that defines the Snakemake workflow
+The included file in this template provides a useful starting point and some
+handy patterns to exploit.
+
+
+
diff --git a/output/README.md b/output/README.md
new file mode 100644
index 0000000..fa9512c
--- /dev/null
+++ b/output/README.md
@@ -0,0 +1,3 @@
+# Output
+
+Save processed data files here.
diff --git a/references/README.md b/references/README.md
new file mode 100644
index 0000000..356fb06
--- /dev/null
+++ b/references/README.md
@@ -0,0 +1,3 @@
+# References
+
+Save reference data files (e.g. genome/transcriptome fasta files, gnomAD/1000 Genomes VCFs, etc) here.
diff --git a/resources/README.md b/resources/README.md
new file mode 100644
index 0000000..68c7187
--- /dev/null
+++ b/resources/README.md
@@ -0,0 +1,3 @@
+# Resources
+
+Save useful resource files (papers, notes, etc.) here.
diff --git a/source/README.md b/source/README.md
new file mode 100644
index 0000000..4c2402f
--- /dev/null
+++ b/source/README.md
@@ -0,0 +1,3 @@
+# Code
+
+Save command-line scripts and shared R code here.
-- 
GitLab