# DIGESTOMIC ## link to DATA ```bash ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J177_A_02_03_16_S3_all_R1_001.fastq.gz DATA/raw/J177_A_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J177_A_02_03_16_S3_all_R2_001.fastq.gz DATA/raw/J177_A_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J177_B_02_03_16_S4_all_R1_001.fastq.gz DATA/raw/J177_B_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J177_B_02_03_16_S4_all_R2_001.fastq.gz DATA/raw/J177_B_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J218_A_120416_S5_all_R1_001.fastq.gz DATA/raw/J218_A_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J218_A_120416_S5_all_R2_001.fastq.gz DATA/raw/J218_A_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J218_B_120416_S6_all_R1_001.fastq.gz DATA/raw/J218_B_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J218_B_120416_S6_all_R2_001.fastq.gz DATA/raw/J218_B_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J281_A_140616_S1_all_R1_001.fastq.gz DATA/raw/J281_A_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J281_A_140616_S1_all_R2_001.fastq.gz DATA/raw/J281_A_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J281_B_140616_S2_all_R1_001.fastq.gz DATA/raw/J281_B_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J281_B_140616_S2_all_R2_001.fastq.gz DATA/raw/J281_B_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J462_A_121216_S3_all_R1_001.fastq.gz DATA/raw/J462_A_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J462_A_121216_S3_all_R2_001.fastq.gz DATA/raw/J462_A_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J462_B_121216_S4_all_R1_001.fastq.gz DATA/raw/J462_B_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J462_B_121216_S4_all_R2_001.fastq.gz DATA/raw/J462_B_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J80_A_26_11_15_S1_all_R1_001.fastq.gz DATA/raw/J80_A_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J80_A_26_11_15_S1_all_R2_001.fastq.gz DATA/raw/J80_A_R2.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J80_B_26_11_15_S2_all_R1_001.fastq.gz DATA/raw/J80_B_R1.fastq.gz ln -s /projet/maiage/save/cmidoux/DIGESTOMIC/DATA/DIGESTOMIC_J80_B_26_11_15_S2_all_R2_001.fastq.gz DATA/raw/J80_B_R2.fastq.gz ``` ## ghostKOALA ### Submit analysis ```bash conda activate seqkit-0.10.1 seqkit split2 --by-size 1000000 --out-dir work/FGS/split work/FGS/coassembly_FGS.faa ``` * Download files * Upload on [GhostKOALA webserver](https://www.kegg.jp/ghostkoala/) with `genus_prokaryotes + family_eukaryotes + viruses ` database option * Submit * Download from webserver and group in order annotation files `user_ko.txt` * Join with other tables ### KEGG orthology table [source](http://merenlab.org/2018/01/17/importing-ghostkoala-annotations/) ```bash cd work/ghostKOALA wget 'https://www.genome.jp/kegg-bin/download_htext?htext=ko00001&format=htext&filedir=' -O ko00001.keg kegfile="ko00001.keg" while read -r prefix content do case "$prefix" in A) col1="$content";; \ B) col2="$content" ;; \ C) col3="$content";; \ D) echo -e "$col1\t$col2\t$col3\t$content";; esac done < <(sed '/^[#!+]/d;s/<[^>]*>//g;s/^./& /' < "$kegfile") > KO_Orthology_ko00001.txt rm ko00001.keg cut -f4 KO_Orthology_ko00001.txt | sort -n | uniq | sed 's/ /\t/' > KO_name.txt ``` ### Combine duplicates lines with comma ```python import csv import os os.system("sort -k1,1 user_ko.txt > user_ko-sort.txt") with open("user_ko-sort.txt", "r") as f: with open("user_ko-uniq.txt", "w") as out: reader=csv.reader(f, delimiter='\t') old_gene="" old_ko="" for row in reader: gene=row[0] try: ko=row[1] except IndexError: ko="NA" if(gene==old_gene): old_ko=old_ko+","+ko else: out.write("%s\t%s\n"%(old_gene, old_ko)) old_gene=gene old_ko=ko out.write("%s\t%s\n"%(gene, ko)) ``` ## Final table ```bash join -t $'\t' report/count-genes_FGS-J80_A-coassembly.tsv report/count-genes_FGS-J80_B-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J177_A-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J177_B-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J218_A-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J218_B-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J281_A-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J281_B-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J462_A-coassembly.tsv | \ join -t $'\t' - report/count-genes_FGS-J462_B-coassembly.tsv | \ sed 's/__/0_/' | \ sort -k1,1 \ > Table/count-genes_coassembly.tsv awk -F $'\t' -v OFS=$'\t' '{if($1=="C") print $2,$3,$4 ; else print $2,$3,"NA; NA; NA; NA; NA; NA; NA;"}' report/genes_coassembly-taxNames.tsv | \ sort -k1,1 \ > Table/taxName_coassembly.tsv sort -k2,2 work/ghostKOALA/user_ko-uniq.txt | \ join --check-order -t $'\t' -e NA -a1 -12 -21 -o 1.1,0,2.2 - work/ghostKOALA/KO_name.txt | \ sort -k1,1 \ > Table/KO_coassembly.tsv echo -e "gene\tJ80_A\tJ80_B\tJ177_A\tJ177_B\tJ218_A\tJ218_B\tJ281_A\tJ281_B\tJ462_A\tJ462_B\ttaxID_kaijuNR\ttaxID_name\tKO_ghostKOALA\tKO_name"> Table/digestomics.tsv join --check-order -t $'\t' -e NA -a1 Table/count-genes_coassembly.tsv Table/taxName_coassembly.tsv | \ join --check-order -t $'\t' -e NA -a1 - Table/KO_coassembly.tsv \ >> Table/digestomics.tsv ```