From da39f1fb26d11dfed08a5258c0cb7de29d60912f Mon Sep 17 00:00:00 2001 From: Cedric Midoux <cedric.midoux@inra.fr> Date: Fri, 6 Sep 2019 17:07:44 +0200 Subject: [PATCH] cdhit catalogue.ffn --- annotation.smk | 28 ++++++++--------- catalogue.smk | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ global.smk | 2 ++ 3 files changed, 98 insertions(+), 14 deletions(-) create mode 100644 catalogue.smk diff --git a/annotation.smk b/annotation.smk index 0e203c2..c92e44c 100644 --- a/annotation.smk +++ b/annotation.smk @@ -2,10 +2,10 @@ rule FragGeneScan: input: unpack(contigs_input) output: - faa = "work/FGS/{sample}_FGS.faa", - ffn = "work/FGS/{sample}_FGS.ffn", - gff = "work/FGS/{sample}_FGS.gff", - out = temp("work/FGS/{sample}_FGS.out") + faa = "work/FGS/{sample}/{sample}_FGS.faa", + ffn = "work/FGS/{sample}/{sample}_FGS.ffn", + gff = "work/FGS/{sample}/{sample}_FGS.gff", + out = temp("work/FGS/{sample}/{sample}_FGS.out") threads: 4 params: @@ -24,9 +24,9 @@ rule prodigal: input: unpack(contigs_input) output: - faa = "work/prodigal/{sample}_prodigal.faa", - ffn = "work/prodigal/{sample}_prodigal.ffn", - gff = "work/prodigal/{sample}_prodigal.gff" + faa = "work/prodigal/{sample}/{sample}_prodigal.faa", + ffn = "work/prodigal/{sample}/{sample}_prodigal.ffn", + gff = "work/prodigal/{sample}/{sample}_prodigal.gff" shell: "prodigal " "-f gff " @@ -41,21 +41,21 @@ rule prodigal: def faa_input(wildcards): if config["PROTEINS-PREDICTOR"] == "FragGeneScan": - return {"faa": "work/FGS/{wildcards.sample}_FGS.faa".format(wildcards=wildcards)} + return {"faa": "work/FGS/{wildcards.sample}/{wildcards.sample}_FGS.faa".format(wildcards=wildcards)} elif config["PROTEINS-PREDICTOR"] == "prodigal": - return {"faa": "work/prodigal/{wildcards.sample}_prodigal.faa".format(wildcards=wildcards)} + return {"faa": "work/prodigal/{wildcards.sample}/{wildcards.sample}_prodigal.faa".format(wildcards=wildcards)} def ffn_input(wildcards): if config["PROTEINS-PREDICTOR"] == "FragGeneScan": - return {"fnn": "work/FGS/{wildcards.sample}_FGS.ffn".format(wildcards=wildcards)} + return {"fnn": "work/FGS/{wildcards.sample}/{wildcards.sample}_FGS.ffn".format(wildcards=wildcards)} elif config["PROTEINS-PREDICTOR"] == "prodigal": - return {"fnn": "work/prodigal/{wildcards.sample}_prodigal.ffn".format(wildcards=wildcards)} + return {"fnn": "work/prodigal/{wildcards.sample}/{wildcards.sample}_prodigal.ffn".format(wildcards=wildcards)} def gff_input(wildcards): if config["PROTEINS-PREDICTOR"] == "FragGeneScan": - return {"gff": "work/FGS/{wildcards.sample}_FGS.gff".format(wildcards=wildcards)} + return {"gff": "work/FGS/{wildcards.sample}/{wildcards.sample}_FGS.gff".format(wildcards=wildcards)} elif config["PROTEINS-PREDICTOR"] == "prodigal": - return {"gff": "work/prodigal/{wildcards.sample}_prodigal.gff".format(wildcards=wildcards)} + return {"gff": "work/prodigal/{wildcards.sample}/{wildcards.sample}_prodigal.gff".format(wildcards=wildcards)} def db_input(wildcards): if wildcards.db == "nr": @@ -156,4 +156,4 @@ rule interproscan: "--formats tsv,gff3,html " "--iprlookup " "--goterms " - "--pathways " \ No newline at end of file + "--pathways " diff --git a/catalogue.smk b/catalogue.smk new file mode 100644 index 0000000..7ff48a1 --- /dev/null +++ b/catalogue.smk @@ -0,0 +1,82 @@ +rule partial_genes: #fonctionne uniquement avec prodigal + input: + "work/prodigal/{sample}/{sample}_prodigal.ffn" + output: + "work/prodigal/{sample}/{sample}_partial_{x}.ffn" + threads: + 2 + shell: + "source activate seqkit-0.10.1 " + " ; " + "seqkit " + "grep " + "--use-regexp " + "--by-name " + "--pattern ';partial={wildcards.x};' " + "--threads {threads} " + "--out-file {output} " + "{input} " + " ; " + "conda deactivate" + +rule pool: + input: + expand("work/prodigal/{sample}/{sample}_partial_{{x}}.ffn", sample=config["SAMPLES"]) + output: + temp("work/prodigal/pool/partial_{x}.ffn") + shell: + "cat " + "{input} " + " > " + "{output}" + +rule pool_incomplet: + input: + expand("work/cdhit/non_redundant_parial_{x}.ffn", x=["01", "10"]) + output: + temp("work/cdhit/non_redundant_parial_10-01.ffn") + shell: + "cat " + "{input} " + " > " + "{output}" + +rule cd_hit: + input: + "work/prodigal/pool/partial_{x}.ffn" + output: + ffn = "work/cdhit/non_redundant_parial_{x}.ffn", + clustr = "work/cdhit/non_redundant_parial_{x}.ffn.clustr" + threads: + 8 + shell: + "cd-hit-est " + "-i {input} " + "-o {output.ffn} " + "-c 0.95 " + "-G 1 " + "-aS 0.90 " + "-d 0 " + "-M 0 " + "-T {threads} " + +rule cd_hit_2D: + input: + complete = "work/cdhit/non_redundant_parial_00.ffn", + incomplete = "work/cdhit/non_redundant_parial_10-01.ffn" + output: + ffn = "work/cdhit/catalogue.ffn", + clustr = "work/cdhit/catalogue.clustr" + threads: + 8 + shell: + "cd-hit-est-2d " + "-i {input.complete} " + "-i2 {input.incomplete} " + "-o {output.ffn} " + "-c 0.95 " + "-aS 0.90 " + "-g 1 " + "-d 0 " + "-M 0 " + "-T {threads} " diff --git a/global.smk b/global.smk index 09b6633..38d01bd 100644 --- a/global.smk +++ b/global.smk @@ -16,6 +16,7 @@ rule all: "report/diamond_nr_coassembly.tsv", "work/prokka/coassembly_prokka.tsv", "work/interproscan/coassembly.faa.tsv", + "work/cdhit/catalogue.ffn", include: "../workflow_metagenomics/quality.smk" @@ -24,3 +25,4 @@ include: "../workflow_metagenomics/kaiju.smk" include: "../workflow_metagenomics/assembly.smk" include: "../workflow_metagenomics/annotation.smk" include: "../workflow_metagenomics/count.smk" +include: "../workflow_metagenomics/catalogue.smk" -- GitLab