diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf
index af6913c1a1073752f685e8c843e0a0b847efb057..d1e72527f3e023033cc252dab4ca0897d491449b 100644
--- a/modules/cd_hit.nf
+++ b/modules/cd_hit.nf
@@ -30,7 +30,7 @@ process GLOBAL_CD_HIT {
     val pct_id
 
   output:
-    path "All-cd-hit-est.${pct_id}.fasta"
+    path "All-cd-hit-est.${pct_id}.fasta", emit: fasta_clusters
     path "table_clstr.txt", emit: clstr_table
 
 
@@ -44,6 +44,42 @@ process GLOBAL_CD_HIT {
 
 }
 
+// Sanity-check the CD-HIT results: the cluster table and the clustered FASTA
+// must agree on the number of clusters, and the number of clustered sequences
+// must equal the number of input sequences. The task exits 1 on a mismatch.
+process CHECK_METRICS {
+  label 'CD_HIT'
+
+  input:
+    path ech_fastas
+    path table_clstr
+    path fasta_cluster
+
+  script:
+    """
+    # Count from the staged inputs (not hard-coded file names), so the check
+    # works for any percentage identity.
+    cat ${ech_fastas} | grep -c ">" > nb_inputed_seq_in_fasta.txt
+    grep -c ">" ${fasta_cluster} > nb_clusters_in_fasta.txt
+    cut -f1 ${table_clstr} | sort -u | wc -l > nb_clusters.txt
+    cat ${table_clstr} | wc -l > nb_seq_clustered.txt
+
+    # Test the exit status of diff directly: with errexit enabled, a failing
+    # diff inside a command substitution aborts before the message is printed.
+    if ! diff nb_clusters.txt nb_clusters_in_fasta.txt > /dev/null
+    then
+        echo "Error: nb cluster after cdhit (individual + global) not consistant [table VS fasta]"
+        exit 1
+    fi
+
+    if ! diff nb_seq_clustered.txt nb_inputed_seq_in_fasta.txt > /dev/null
+    then
+        echo "Error: nb contigs after cdhit (individual + global) not consistant [table VS fasta]"
+        exit 1
+    fi
+    """
+
+}
 
 workflow CD_HIT {
 
@@ -53,14 +89,13 @@ ch_percentage_identity // channel: val
 
   main:
     INDIVIDUAL_CD_HIT( ch_assembly, ch_percentage_identity )
-
     ch_individual_clusters = INDIVIDUAL_CD_HIT.out.clstr_fasta.collect()
-
     GLOBAL_CD_HIT(ch_individual_clusters , ch_percentage_identity )
+    ch_ffn = ch_assembly.flatMap{it -> it[1]}.collect()
+    CHECK_METRICS(ch_ffn, GLOBAL_CD_HIT.out.clstr_table , GLOBAL_CD_HIT.out.fasta_clusters)
 
   emit:
     individual_clstr_table = INDIVIDUAL_CD_HIT.out.clstr_table
     global_clstr_table = GLOBAL_CD_HIT.out.clstr_table
 
-}
-
+}
\ No newline at end of file