From da39f1fb26d11dfed08a5258c0cb7de29d60912f Mon Sep 17 00:00:00 2001
From: Cedric Midoux <cedric.midoux@inra.fr>
Date: Fri, 6 Sep 2019 17:07:44 +0200
Subject: [PATCH] cdhit catalogue.ffn

---
 annotation.smk | 28 ++++++++---------
 catalogue.smk  | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++
 global.smk     |  2 ++
 3 files changed, 98 insertions(+), 14 deletions(-)
 create mode 100644 catalogue.smk

diff --git a/annotation.smk b/annotation.smk
index 0e203c2..c92e44c 100644
--- a/annotation.smk
+++ b/annotation.smk
@@ -2,10 +2,10 @@ rule FragGeneScan:
 	input:
 		unpack(contigs_input)
 	output:
-		faa = "work/FGS/{sample}_FGS.faa",
-		ffn = "work/FGS/{sample}_FGS.ffn",
-		gff = "work/FGS/{sample}_FGS.gff",
-		out = temp("work/FGS/{sample}_FGS.out")
+		faa = "work/FGS/{sample}/{sample}_FGS.faa",
+		ffn = "work/FGS/{sample}/{sample}_FGS.ffn",
+		gff = "work/FGS/{sample}/{sample}_FGS.gff",
+		out = temp("work/FGS/{sample}/{sample}_FGS.out")
 	threads:
 		4
 	params:
@@ -24,9 +24,9 @@ rule prodigal:
 	input:
 		unpack(contigs_input)
 	output:
-		faa = "work/prodigal/{sample}_prodigal.faa",
-		ffn = "work/prodigal/{sample}_prodigal.ffn",
-		gff = "work/prodigal/{sample}_prodigal.gff"
+		faa = "work/prodigal/{sample}/{sample}_prodigal.faa",
+		ffn = "work/prodigal/{sample}/{sample}_prodigal.ffn",
+		gff = "work/prodigal/{sample}/{sample}_prodigal.gff"
 	shell:
 		"prodigal "
 		"-f gff "
@@ -41,21 +41,21 @@ rule prodigal:
 
 def faa_input(wildcards):
 	if config["PROTEINS-PREDICTOR"] == "FragGeneScan":
-		return {"faa": "work/FGS/{wildcards.sample}_FGS.faa".format(wildcards=wildcards)}
+		return {"faa": "work/FGS/{wildcards.sample}/{wildcards.sample}_FGS.faa".format(wildcards=wildcards)}
 	elif config["PROTEINS-PREDICTOR"] == "prodigal":
-		return {"faa": "work/prodigal/{wildcards.sample}_prodigal.faa".format(wildcards=wildcards)}
+		return {"faa": "work/prodigal/{wildcards.sample}/{wildcards.sample}_prodigal.faa".format(wildcards=wildcards)}
 
 def ffn_input(wildcards):
 	if config["PROTEINS-PREDICTOR"] == "FragGeneScan":
-		return {"fnn": "work/FGS/{wildcards.sample}_FGS.ffn".format(wildcards=wildcards)}
+		return {"fnn": "work/FGS/{wildcards.sample}/{wildcards.sample}_FGS.ffn".format(wildcards=wildcards)}
 	elif config["PROTEINS-PREDICTOR"] == "prodigal":
-		return {"fnn": "work/prodigal/{wildcards.sample}_prodigal.ffn".format(wildcards=wildcards)}
+		return {"fnn": "work/prodigal/{wildcards.sample}/{wildcards.sample}_prodigal.ffn".format(wildcards=wildcards)}
 
 def gff_input(wildcards):
 	if config["PROTEINS-PREDICTOR"] == "FragGeneScan":
-		return {"gff": "work/FGS/{wildcards.sample}_FGS.gff".format(wildcards=wildcards)}
+		return {"gff": "work/FGS/{wildcards.sample}/{wildcards.sample}_FGS.gff".format(wildcards=wildcards)}
 	elif config["PROTEINS-PREDICTOR"] == "prodigal":
-		return {"gff": "work/prodigal/{wildcards.sample}_prodigal.gff".format(wildcards=wildcards)}
+		return {"gff": "work/prodigal/{wildcards.sample}/{wildcards.sample}_prodigal.gff".format(wildcards=wildcards)}
 
 def db_input(wildcards):
 	if wildcards.db == "nr":
@@ -156,4 +156,4 @@ rule interproscan:
 		"--formats tsv,gff3,html "
 		"--iprlookup "
 		"--goterms "
-		"--pathways "
\ No newline at end of file
+		"--pathways "
diff --git a/catalogue.smk b/catalogue.smk
new file mode 100644
index 0000000..7ff48a1
--- /dev/null
+++ b/catalogue.smk
@@ -0,0 +1,82 @@
+rule partial_genes: #fonctionne uniquement avec prodigal
+	input:
+		"work/prodigal/{sample}/{sample}_prodigal.ffn"
+	output:
+		"work/prodigal/{sample}/{sample}_partial_{x}.ffn"
+	threads:
+		2
+	shell:
+		"source activate seqkit-0.10.1 "
+		" ; "
+		"seqkit "
+		"grep "
+		"--use-regexp "
+		"--by-name "
+		"--pattern ';partial={wildcards.x};' "
+		"--threads {threads} "
+		"--out-file {output} "
+		"{input} "
+		" ; "
+		"conda deactivate"
+
+rule pool:
+	input:
+		expand("work/prodigal/{sample}/{sample}_partial_{{x}}.ffn", sample=config["SAMPLES"])
+	output:
+		temp("work/prodigal/pool/partial_{x}.ffn")
+	shell:
+		"cat "
+		"{input} "
+		" > "
+		"{output}"
+
+rule pool_incomplet:
+	input:
+		expand("work/cdhit/non_redundant_parial_{x}.ffn", x=["01", "10"])
+	output:
+		temp("work/cdhit/non_redundant_parial_10-01.ffn")
+	shell:
+		"cat "
+		"{input} "
+		" > "
+		"{output}"
+
+rule cd_hit:
+	input:
+		"work/prodigal/pool/partial_{x}.ffn"
+	output:
+		ffn = "work/cdhit/non_redundant_parial_{x}.ffn",
+		clustr = "work/cdhit/non_redundant_parial_{x}.ffn.clustr"
+	threads:
+		8
+	shell:
+		"cd-hit-est "
+		"-i {input} "
+		"-o {output.ffn} "
+		"-c 0.95 "
+		"-G 1 "
+		"-aS 0.90 "
+		"-d 0 "
+		"-M 0 "
+		"-T {threads} "
+
+rule cd_hit_2D:
+	input:
+		complete = "work/cdhit/non_redundant_parial_00.ffn",
+		incomplete = "work/cdhit/non_redundant_parial_10-01.ffn"
+	output:
+		ffn = "work/cdhit/catalogue.ffn",
+		clustr = "work/cdhit/catalogue.clustr"
+	threads:
+		8
+	shell:
+		"cd-hit-est-2d "
+		"-i {input.complete} "
+		"-i2 {input.incomplete} "
+		"-o {output.ffn} "
+		"-c 0.95 "
+		"-aS 0.90 "
+		"-g 1 "
+		"-d 0 "
+		"-M 0 "
+		"-T {threads} "
diff --git a/global.smk b/global.smk
index 09b6633..38d01bd 100644
--- a/global.smk
+++ b/global.smk
@@ -16,6 +16,7 @@ rule all:
 		"report/diamond_nr_coassembly.tsv",
 		"work/prokka/coassembly_prokka.tsv",
 		"work/interproscan/coassembly.faa.tsv",
+		"work/cdhit/catalogue.ffn",
 
 
 include: "../workflow_metagenomics/quality.smk"
@@ -24,3 +25,4 @@ include: "../workflow_metagenomics/kaiju.smk"
 include: "../workflow_metagenomics/assembly.smk"
 include: "../workflow_metagenomics/annotation.smk"
 include: "../workflow_metagenomics/count.smk"
+include: "../workflow_metagenomics/catalogue.smk"
-- 
GitLab