diff --git a/assets/hifi_multiqc_config.yaml b/assets/hifi_multiqc_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c90beaa232e572fe9c861ca2133952ea4f95c61d
--- /dev/null
+++ b/assets/hifi_multiqc_config.yaml
@@ -0,0 +1,92 @@
+report_comment: >
+    This report has been generated by the <a href="https://forgemia.inra.fr/genotoul-bioinfo/metagwgs" target="_blank">genotoul-bioinfo/metagwgs</a>
+    analysis pipeline. For information about how to interpret these results, please see the
+    <a href="https://forgemia.inra.fr/genotoul-bioinfo/metagwgs" target="_blank">documentation</a>.
+
+extra_fn_clean_trim:
+    - "hifi_"
+    - 'reads_'
+    - '.count_reads_on_contigs'
+    - '_scaffolds'
+    - '.txt'
+    - '.contigs'
+    - '.sort'
+    - "cleaned_"
+    - "raw_"
+    - '_kept_contigs'
+    - '.no_filter'
+    - '_kaiju_MEM_verbose'
+
+extra_fn_clean_exts:
+    - "_select_contigs_cpm"
+    - '.host_filter'
+
+module_order:
+    - fastqc:
+        name: 'FastQC (raw)'
+        path_filters_exclude:
+            - '*cleaned_*.zip'
+    - samtools:
+        name: 'Reads before host reads filter'
+        info: 'This section reports on the alignment of reads against the host genome.'
+        path_filters:
+            - '*.no_filter.flagstat'
+    - samtools:
+        name: 'Reads after host reads filter'
+        info: 'This section reports on the alignment of cleaned reads against the host genome.'
+        path_filters:
+            - '*.host_filter.flagstat'
+    - fastqc:
+        name: 'FastQC (cleaned)'
+        info: 'This section of the report shows FastQC results after removing reads mapping to the host genome.'
+        path_filters:
+            - '*cleaned_*.zip'
+    - kaiju
+    - quast:
+        name: 'Quast primary assembly'
+        info: 'This section of the report shows quast results after assembly.'
+        path_filters:
+            - '*quast_primary/*/report.tsv'
+    - quast:
+        name: 'Quast filtered assembly'
+        info: 'This section of the report shows quast results after assembly filtering.'
+        path_filters:
+            - '*quast_filtered/*/report.tsv'
+    - prokka:
+        name: 'Structural annotation'
+        info: 'This section of the report shows structural annotation results. CDS are predicted using Prodigal, rRNA using Barrnap and tRNA using tRNAscan-SE.'
+    - featureCounts
+    - custom_content:
+        name: 'Binning results'
+        info: 'This section of the report shows quast results after binning the contigs into species genomes'
+        path_filters:
+            - '*mqc.tsv'
+
+report_section_order:
+    stats_table:
+        order: -1000
+    bins_quality:
+        order: -1010
+    bins_quality_count:
+        order: -1020
+    bins_quality_size:
+        order: -1030
+    binning_heatmap:
+        order: -1040
+    software_versions:
+        order: -1050
+
+prokka_fn_snames: True
+prokka_table: True
+
+featurecounts:
+    fn: '*.summary'
+    shared: true
+
+table_columns_visible:
+    FastQC (raw):
+        percent_duplicates: False
+        percent_gc: False
+    Structural annotation:
+        organism: False
+
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
index fc5eb8eb127cf58a61e5b290ea7c552ae75bac65..c24c30505a78be352c69bfa26da761ff23615535 100644
--- a/assets/multiqc_config.yaml
+++ b/assets/multiqc_config.yaml
@@ -69,11 +69,14 @@ module_order:
         path_filters:
             - './final_assembly_flagstat/*'
     - prokka:
+        name: 'Structural annotation'
+        info: 'This section of the report shows structural annotation results. CDS are predicted using Prodigal, rRNA using Barrnap and tRNA using tRNAscan-SE.'
         path_filters:
            - './prokka_report/*'
     - featureCounts:
         path_filters:
-            - './featureCounts_report/*'
+            - './featureCounts_report/*'
+            - '*.count_reads_on_contigs.flagstat'
     - custom_content:
         name: 'Binning results'
         info: 'This section of the report shows quast results after binning the contigs into species genomes'
@@ -106,11 +109,12 @@ featurecounts:
     fn: '*.summary'
     shared: true
 
+
 table_columns_visible:
     FastQC (raw):
         percent_duplicates: False
         percent_gc: False
-    prokka:
+    Structural annotation:
         organism: False
     Reads alignment on unfiltered assembly:
         mapped_passed_pct: True
@@ -141,3 +145,4 @@ table_columns_visible:
 #     - hide
 # show_hide_patterns:
 #     - ["_R1", "_R2"]
+
diff --git a/assets/sr_multiqc_config.yaml b/assets/sr_multiqc_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..18076ef26f91725b93b078a45c22cdaa7e1de3e0
--- /dev/null
+++ b/assets/sr_multiqc_config.yaml
@@ -0,0 +1,106 @@
+report_comment: >
+    This report has been generated by the <a href="https://forgemia.inra.fr/genotoul-bioinfo/metagwgs" target="_blank">genotoul-bioinfo/metagwgs</a>
+    analysis pipeline. For information about how to interpret these results, please see the
+    <a href="https://forgemia.inra.fr/genotoul-bioinfo/metagwgs" target="_blank">documentation</a>.
+
+extra_fn_clean_trim:
+    - "cleaned_"
+    - "raw_"
+    - '_kept_contigs'
+    - '.count_reads_on_contigs'
+    - '.no_filter'
+    - '.host_filter'
+    - '_scaffolds'
+    - '.txt'
+    - '.contigs'
+    - '.sort'
+    - '_kaiju_MEM_verbose'
+    - '_sickle'
+    - '_cutadapt'
+
+extra_fn_clean_exts:
+    - "_select_contigs_cpm"
+
+use_filename_as_sample_name:
+    - cutadapt
+
+module_order:
+    - fastqc:
+        name: 'FastQC (raw)'
+        path_filters_exclude:
+            - '*cleaned_*.zip'
+    - cutadapt
+    - sickle:
+        path_filters:
+            - '*_sickle.log'
+    - samtools:
+        name: 'Reads before host reads filter'
+        info: 'This section reports on the alignment of reads against the host genome with bwa.'
+        path_filters:
+            - '*.no_filter.flagstat'
+    - samtools:
+        name: 'Reads after host reads filter'
+        info: 'This section reports on the alignment of cleaned reads against the host genome with bwa.'
+        path_filters:
+            - '*.host_filter.flagstat'
+    - fastqc:
+        name: 'FastQC (cleaned)'
+        info: 'This section of the report shows FastQC results after adapter trimming and cleaning.'
+        path_filters:
+            - '*cleaned_*.zip'
+    - kaiju
+    - quast:
+        name: 'Quast primary assembly'
+        info: 'This section of the report shows quast results after assembly.'
+        path_filters:
+            - '*quast_primary/*/report.tsv'
+    - quast:
+        name: 'Quast filtered assembly'
+        info: 'This section of the report shows quast results after assembly filtering.'
+        path_filters:
+            - '*quast_filtered/*/report.tsv'
+    - samtools:
+        name: 'Reads after deduplication'
+        info: 'This section reports on the alignment of deduplicated reads against contigs with bwa.'
+        path_filters:
+            - '*.count_reads_on_contigs.flagstat'
+    - prokka:
+        name: 'Structural annotation'
+        info: 'This section of the report shows structural annotation results. CDS are predicted using Prodigal, rRNA using Barrnap and tRNA using tRNAscan-SE.'
+    - featureCounts
+    - custom_content:
+        name: 'Binning results'
+        info: 'This section of the report shows quast results after binning the contigs into species genomes'
+        path_filters:
+            - '*mqc.tsv'
+
+report_section_order:
+    stats_table:
+        order: -1000
+    bins_quality:
+        order: -1010
+    bins_quality_count:
+        order: -1020
+    bins_quality_size:
+        order: -1030
+    binning_heatmap:
+        order: -1040
+    software_versions:
+        order: -1050
+
+
+
+prokka_fn_snames: True
+prokka_table: True
+
+featurecounts:
+    fn: '*.summary'
+    shared: true
+
+
+table_columns_visible:
+    FastQC (raw):
+        percent_duplicates: False
+        percent_gc: False
+    Structural annotation:
+        organism: False
diff --git a/bin/Filter_contig_per_cpm.py b/bin/Filter_contig_per_cpm.py
deleted file mode 100755
index 9127e57e3e0a1de965e97422adb819377210f94a..0000000000000000000000000000000000000000
--- a/bin/Filter_contig_per_cpm.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/env python
-
-"""----------------------------------------------------------------------------
-    Script Name: Filter_contig_per_cpm.py
-    Description: Calculates the CPM normalization of mapped reads for each \
-contig and returns contigs which have a CPM > cutoff in .fa.
-    Input files: Samtools idxstats output file, .fasta file of contigs.
-    Created By: Joanna Fourquet
-    Date: 2020-04-01
--------------------------------------------------------------------------------
-"""
-
-# Metadata
-__author__ = 'Joanna Fourquet \
-- GenPhySE - Team NED'
-__copyright__ = 'Copyright (C) 2020 INRA'
-__license__ = 'GNU General Public License'
-__version__ = '0.1'
-__email__ = 'support.bioinfo.genotoul@inra.fr'
-__status__ = 'dev'
-
-# Status: dev
-
-# Modules importation
-
-try:
-    import argparse
-    import sys
-    import pandas as p
-    import numpy as np
-    from Bio.Seq import Seq
-    from Bio.SeqRecord import SeqRecord
-    import pprint
-    from Bio import SeqIO
-except ImportError as error:
-    print(error)
-    exit(1)
-
-################################################
-# Function
-################################################
-
-def cpm(counts):
-    N = np.sum(counts.iloc[:,2], axis=0)
-    C = counts.iloc[:,2]
-    cpm_values = 1e6 * C / N
-    return(cpm_values)
-
-def main(argv):
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-i", "--samtools_idxstats", \
-    required = True, help = "samtools idxstats file containing contig id, \
-    sequence length, number of mapped reads or fragments, \
-    number of unmapped reads or fragments")
-    parser.add_argument('-f', '--fasta_file', required = True, help = \
-    'fasta file containing sequences of contigs.')
-    parser.add_argument("-c", "--cutoff_cpm", required = True, \
-    help = "Minimum number of reads in a contig")
-    parser.add_argument("-s", "--select", \
-    help = "Name of outpout .fa file containing contigs which passed cpm cutoff")
-    parser.add_argument("-d", "--discard", \
-    help = "Name of outpout .fa file containing contigs which don't passed cpm cutoff")
-    args = parser.parse_args()
-
-    # Read input table
-    raw_counts = p.read_table(args.samtools_idxstats,sep ='\t',header = None, comment="*")
-
-    # Calculates cpm for each contig
-    res_cpm = cpm(raw_counts)
-
-    cutoff = float(args.cutoff_cpm)
-    # Contigs with nb reads > cutoff
-    kept_contigs = raw_counts.iloc[np.where(res_cpm >= cutoff)[0],0]
-
-    # Contigs with nb reads < cutoff
-    unkept_contigs = raw_counts.iloc[np.where(res_cpm < cutoff)[0],0]
-
-    # Write new fasta files with kept and unkept contigs
-    with open(args.fasta_file, "rU") as fasta_file,\
-         open(args.select, "w") as out_select_handle,\
-         open(args.discard, "w") as out_discard_handle:
-        for record in SeqIO.parse(fasta_file, "fasta"):
-            try :
-                contig_id = record.id
-                if(contig_id in list(kept_contigs)):
-                    SeqIO.write(record, out_select_handle, "fasta")
-                else:
-                    if(contig_id in list(unkept_contigs)):
-                        SeqIO.write(record, out_discard_handle, "fasta")
-            except :
-                print ("Warning input fasta file: contig " + record.id + " issue")
-                pass
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/bin/Rename_contigs_and_genes.py b/bin/Rename_contigs_and_genes.py
deleted file mode 100755
index 7d0e94d4a9de9e5af57d1ebcd1b6ea0922f57acd..0000000000000000000000000000000000000000
--- a/bin/Rename_contigs_and_genes.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python
-
-"""----------------------------------------------------------------------------------------------------------------------------------------------------------
-    Script Name: Rename_contigs_and_genes.py
-    Description: Rename contigs and genes in GFF, FAA and FFN
-    files generated by PROKKA.
-    Input files:
-    GFF, FAA and FFN files produced by PROKKA.
-    Created By: Celine Noirot and Joanna Fourquet
-    Date: 2019-06-12
-----------------------------------------------------------------------------------------------------------------------------------------------------------
-"""
-
-# Metadata
-__author__ = 'Celine Noirot and Joanna Fourquet \
-- Plateforme bioinformatique Toulouse'
-__copyright__ = 'Copyright (C) 2019 INRA'
-__license__ = 'GNU General Public License'
-__version__ = '0.1'
-__email__ = 'support.bioinfo.genotoul@inra.fr'
-__status__ = 'dev'
-
-# Status: dev
-
-# Modules importation
-try:
-    import argparse
-    from BCBio import GFF
-    from Bio.Seq import Seq
-    from Bio.SeqRecord import SeqRecord
-    from Bio.SeqFeature import SeqFeature, FeatureLocation
-    import pprint
-    from BCBio.GFF import GFFExaminer
-    from Bio import SeqIO
-except ImportError as error:
-    print(error)
-    exit(1)
-
-# Manage parameters
-parser = argparse.ArgumentParser(description = \
-'Script which rename contigs and genes in GFF, FAA and FFN files generated by PROKKA.')
-parser.add_argument('-f', '--file', required = True, help = \
-'GFF file generated by PROKKA.')
-parser.add_argument('-faa', '--fastaFile', required = True, \
-help = 'Fasta of predicted sequence (aa) generated by PROKKA (FAA).')
-parser.add_argument('-ffn', '--ffnFile', required = True, \
-help = 'Fasta of predicted sequence (nuc) generated by PROKKA (FFN).')
-parser.add_argument('-fna', '--fnaFile', required = True, \
-help = 'Fasta of contigs generated by PROKKA.')
-parser.add_argument('-p', '--prefix', required = True, \
-help = 'Contig name prefix.')
-parser.add_argument('-oGFF', '--outGFFFile', required = True, \
-help = 'Name of output GFF file.')
-parser.add_argument('-oFAA', '--outFAAFile', required = True, \
-help = 'Filename of renamed predicted fasta sequences (aa).')
-parser.add_argument('-oFFN', '--outFFNFile', required = True, \
-help = 'Filename of renamed predicted fasta sequences (nuc).')
-parser.add_argument('-oFNA', '--outFNAFile', required = True, \
-help = 'Filename of renamed contig sequences (nuc).')
-parser.add_argument('-oprottable', '--outProtein', default = "protein_table.csv", \
-help = 'Filename for protein names correspondance table.')
-parser.add_argument('-oconttable', '--outContig', default = "contig_table.csv", \
-help = 'Filename for contig names correspondance table.')
-args = parser.parse_args()
-
-
-# Variable names informations:
-# prot: corresponds to proteins
-# ctg: corresponds to contigs
-
-# Variables initialization.
-prot_names = {}
-contig_renames = {}
-ctg_prefix = args.prefix
-prot_prefix = "Prot_"
-to_write = []
-# lecture fna
-#remplissage
-#contig_renames [ald_name]=newname
-#reecriture du fasta
-
-with open(args.fnaFile, "r") as fnaFile,\
-    open(args.outFNAFile, "w") as outFNA_handle:
-    for record in SeqIO.parse(fnaFile, "fasta"):
-        try :
-            old_ctg_name = record.id
-            new_ctg_name = ctg_prefix + "_c" + old_ctg_name.split("_")[-1]
-            contig_renames[old_ctg_name] = new_ctg_name
-            record.id = contig_renames[old_ctg_name]
-            record.description = record.description.replace(old_ctg_name,"")
-            SeqIO.write(record, outFNA_handle, "fasta")
-        except :
-            print ("Warning FNA file : contig " + record.id + " discarded, no new name defined")
-            pass
-
-with open(args.file) as gffFile,\
-    open(args.outGFFFile, "w") as out_handle,\
-    open(args.outProtein, "w") as fh_prot_table,\
-    open(args.outContig, "w") as fh_cont_table,\
-    open(args.outContig + ".sed", "w") as fh_cont_sed:
-    for rec in GFF.parse(gffFile):
-        # Access to contig id
-        old_ctg_name = rec.id
-        new_ctg_name = contig_renames[old_ctg_name]
-        rec.id = new_ctg_name
-
-        fh_cont_table.write(old_ctg_name + "\t" + new_ctg_name + "\n")
-        fh_cont_sed.write("s/" + old_ctg_name + "/" + new_ctg_name + "/\n")
-        # Access to features
-        for f_index,feature in enumerate(rec.features):
-            if(not(feature.qualifiers['source'][0].startswith("minced"))):
-                #Generate correspondance
-                old_prot_name = feature.qualifiers['ID'][0].replace("_gene","")
-                prot_number = old_prot_name.split("_")[-1]
-
-                subfeat_types = {subfeat.type for subfeat in feature.sub_features}
-                assert len(subfeat_types) == 1, f'Subfeature have different types {subfeat_types}'
-                subfeat_type = subfeat_types.pop()
-
-
-                new_prot_name = f"{new_ctg_name}.{subfeat_type}_{prot_number}"
-                prot_names[old_prot_name] = new_prot_name
-                fh_prot_table.write(old_prot_name + "\t" + new_prot_name + "\n")
-
-                #Initialize field of "gene" feature (the parent)
-                rec.features[f_index].qualifiers["ID"] = new_prot_name + "_gene"
-                rec.features[f_index].qualifiers["locus_tag"] = [new_prot_name]
-
-                #Annotations (not prokka lines) are in sub_features
-                for fsub_index,sub_feature in enumerate(feature.sub_features):
-                    # Update ID
-                    rec.features[f_index].sub_features[fsub_index].qualifiers["ID"] = new_prot_name
-                    rec.features[f_index].sub_features[fsub_index].qualifiers["Parent"] = []
-                    rec.features[f_index].sub_features[fsub_index].qualifiers["locus_tag"] = [new_prot_name]
-                    rec.features[f_index].sub_features[fsub_index].qualifiers["protein_id"] = [new_prot_name]
-        to_write.append(rec)
-
-    #Write only one time
-    #print (to_write)
-    GFF.write(to_write, out_handle)
-
-
-with open(args.fastaFile, "r") as handle,\
-    open(args.outFAAFile, "w") as outFasta_handle:
-    for record in SeqIO.parse(handle, "fasta"):
-        try :
-            old = record.id
-            record.id = prot_names[record.id]
-            record.description = record.description.replace(old + " ","")
-            SeqIO.write(record, outFasta_handle, "fasta")
-        except :
-            print ("Warning FAA file : protein " + record.id + " discarded, no new name defined")
-            pass
-
-
-with open(args.ffnFile, "r") as handle,\
-    open(args.outFFNFile, "w") as outFFN_handle:
-    for record in SeqIO.parse(handle, "fasta"):
-        try :
-            old = record.id
-            record.id = prot_names[record.id]
-            record.description = record.description.replace(old + " ","")
-            SeqIO.write(record, outFFN_handle, "fasta")
-        except :
-            print ("Warning FFN file : protein " + record.id + " discarded, no new name defined")
-            pass
diff --git a/bin/aln2taxaffi.py b/bin/aln2taxaffi.py
index 5c85116c0c11d166ad7c95f9a154549304f7accb..241d836b1216d0b0cf02e5a1032a15e7abc24b7e 100755
--- a/bin/aln2taxaffi.py
+++ b/bin/aln2taxaffi.py
@@ -597,13 +597,14 @@ def main():
             logging.debug(contig_affi_line)
 
-
-    with open(query_length_file) as fl:
-        nb_total_prot = len([line for line in fl])
-
-    nb_prot_annotated = len(matches)
-    plot_taxonomic_assignment(
-        output_name, count_rank_affiliation_protein, count_rank_affiliation_contig, nb_total_prot, nb_prot_annotated, nb_prot_assigned)
+    if query_length_file:
+        logging.debug("Plot taxonomic affiliation using protein lengths.")
+        with open(query_length_file) as fl:
+            nb_total_prot = len([line for line in fl])
+
+        nb_prot_annotated = len(matches)
+        plot_taxonomic_assignment(
+            output_name, count_rank_affiliation_protein, count_rank_affiliation_contig, nb_total_prot, nb_prot_annotated, nb_prot_assigned)
 
     if args.write_top_taxons:
         top_taxon_columns = ['contig', ] + main_ranks
diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py
new file mode 100755
index 0000000000000000000000000000000000000000..fdea68d1d96b150bc8f2a09d7c328819055392be
--- /dev/null
+++ b/bin/filter_contig_per_cpm.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+
+"""----------------------------------------------------------------------------
+    Script Name: filter_contig_per_cpm.py
+    Description: Calculates the CPM normalization of mapped reads for each
+                 contig and returns contigs which have a CPM > cutoff in .fa.
+    Input files: Samtools idxstats output file, .fasta file of contigs.
+    Created By: Jean Mainguy
+    Date: 2022-10-24
+-------------------------------------------------------------------------------
+"""
+
+# Metadata
+__author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
+__copyright__ = 'Copyright (C) 2022 INRAE'
+__license__ = 'GNU General Public License'
+__version__ = '0.1'
+__email__ = 'support.bioinfo.genotoul@inra.fr'
+__status__ = 'dev'
+
+# Status: dev
+
+# Modules importation
+
+
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+import pandas as pd
+import numpy as np
+import logging
+import pyfastx
+
+################################################
+# Function
+################################################
+
+def parse_arguments():
+    """Parse script arguments."""
+    parser = ArgumentParser(description="...",
+                            formatter_class=ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument("-i", "--samtools_idxstats", nargs='+', required = True,
+                        help = "samtools idxstats file containing contig id, \
+                        sequence length, number of mapped reads or fragments, \
+                        number of unmapped reads or fragments")
+
+    parser.add_argument('-f', '--fasta_file', required = True,
+                        help = 'fasta file containing sequences of contigs.')
+    parser.add_argument("-c", "--cutoff_cpm", required = True,
+                        help = "Minimum CPM value for a contig to be kept")
+    parser.add_argument("-s", "--select",
+                        help = "Name of output .fa file containing contigs which passed the cpm cutoff")
+    parser.add_argument("-d", "--discard",
+                        help = "Name of output .fa file containing contigs which don't pass the cpm cutoff")
+
+    parser.add_argument("-v", "--verbose", help="increase output verbosity",
+                        action="store_true")
+
+    args = parser.parse_args()
+    return args
+
+def combine_idxstat_files(idxstat_files):
+    """
+    Combine multiple idxstat files that have the same contigs.
+
+    Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences.
+    """
+    columns_names = ['reference_sequence_name',
+                     'sequence_length',
+                     '#_mapped_read_segments',
+                     '#_unmapped_read_segments']
+
+    idxstat_df = pd.read_csv(idxstat_files[0],
+                             sep ='\t',
+                             names = columns_names,
+                             usecols = ['reference_sequence_name',
+                                        'sequence_length',
+                                        '#_mapped_read_segments',],
+                             comment="*").set_index('reference_sequence_name')
+
+    for idxstat_file in idxstat_files[1:]:
+        other_idxstat_df = pd.read_csv(idxstat_file,
+                                       sep ='\t',
+                                       names = columns_names,
+                                       usecols = ['reference_sequence_name',
+                                                  '#_mapped_read_segments',],
+                                       comment="*").set_index('reference_sequence_name')
+
+        idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments']
+
+    return idxstat_df
+
+def main():
+    args = parse_arguments()
+
+    if args.verbose:
+        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
+        logging.info('Mode verbose ON')
+
+    else:
+        logging.basicConfig(format="%(levelname)s: %(message)s")
+
+    cpm_cutoff = float(args.cutoff_cpm)
+
+    # Read input tables
+    idxstat_df = combine_idxstat_files(args.samtools_idxstats)
+
+    # Calculate cpm for each contig
+    sum_reads = idxstat_df['#_mapped_read_segments'].sum()
+    logging.info(f'Total number of mapped reads {sum_reads}')
+
+    logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.')
+    idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads
+
+    # Contigs with CPM >= cutoff
+    kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index
+
+    logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.')
+    # Write new fasta files with kept and discarded contigs
+    with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle:
+
+        for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False):
+            if contig in kept_contigs:
+                out_select_handle.write(f'>{contig}\n{seq}\n')
+            else:
+                out_discard_handle.write(f'>{contig}\n{seq}\n')
+
+if __name__ == "__main__":
+    main()
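Review note: the CPM arithmetic above reduces to a fixed read-count threshold once the library size is known. A minimal sketch of the same computation on a toy idxstats table (pandas only; contig names and counts are made up):

import pandas as pd

# Toy idxstats-like table: contig name and number of mapped read segments.
df = pd.DataFrame({'reference_sequence_name': ['c1', 'c2', 'c3'],
                   '#_mapped_read_segments': [900_000, 99_000, 1_000]}
                  ).set_index('reference_sequence_name')

total = df['#_mapped_read_segments'].sum()  # 1,000,000 mapped reads in total
df['cpm_count'] = 1e6 * df['#_mapped_read_segments'] / total

# A CPM cutoff of 10_000 therefore keeps contigs with at least
# (total * 10_000) / 1e6 = 10_000 mapped reads: here c1 and c2.
kept_contigs = df.index[df['cpm_count'] >= 10_000]
print(list(kept_contigs))  # ['c1', 'c2']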
diff --git a/bin/merge_annotations.py b/bin/merge_annotations.py
new file mode 100755
index 0000000000000000000000000000000000000000..d18ebd03b4ca221a59e8fd24791d5c4af60be26a
--- /dev/null
+++ b/bin/merge_annotations.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+
+"""
+Combine structural annotations made from different gff.
+
+In case of overlapping annotations, RNA annotations are preferred to CDS annotations.
+The faa file from prodigal is processed to filter out overlapping sequences.
+
+:Example:
+merge_annotations.py -c prodigal.gff -r barrnap.gff -t trnascan_se.gff --contig_seq contigs.fasta --faa_file prodigal.faa
+"""
+
+# Metadata
+__author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
+__copyright__ = 'Copyright (C) 2020 INRAE'
+__license__ = 'GNU General Public License'
+__version__ = '0.1'
+__email__ = 'support.bioinfo.genotoul@inra.fr'
+__status__ = 'dev'
+
+
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
+import logging
+import gzip
+import csv
+from collections import defaultdict
+import pyfastx
+from Bio.Seq import Seq
+
+
+def parse_arguments():
+    """Parse script arguments."""
+    parser = ArgumentParser(description="...",
+                            formatter_class=ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('-c', '--cds', help='gff file with CDS annotations.', required=True)
+
+    parser.add_argument('-t', '--trna', help='gff file with tRNA annotations.', required=True)
+
+    parser.add_argument('-r', '--rrna', help='gff file with rRNA annotations.', required=True)
+
+    parser.add_argument('--contig_seq', help='fasta file of contigs. Needed to extract gene sequences.', required=True)
+
+    parser.add_argument('--faa_file', help='fasta file of protein sequences generated by prodigal.', required=True)
+
+    parser.add_argument('--gff_output', help='final gff file with all annotations.', default="all_annotation.gff")
+
+    parser.add_argument('--ffn_output', help='final ffn file with all annotations.', default="all_annotation.ffn")
+
+    parser.add_argument('--faa_output', help='final faa file with all CDS annotations in amino acid.', default="all_annotation.faa")
+
+    parser.add_argument('--report_output', help='Prokka-like report used to show annotations in multiqc.', default="annotation_report.txt")
+
+    parser.add_argument("-v", "--verbose", help="increase output verbosity",
+                        action="store_true")
+
+    args = parser.parse_args()
+    return args
+
+def read_file_and_ignore_hashtag(file_path):
+
+    proper_open = gzip.open if file_path.endswith('.gz') else open
+    with proper_open(file_path, 'rt') as fl:
+        for line in fl:
+            if line.startswith('#'):
+                continue
+            yield line
+
+def group_annotations_by_contigs(*gff_annotations):
+
+    contig2annotations = defaultdict(list)
+    for annotations in gff_annotations:
+        for annotation in annotations:
+            contig2annotations[annotation['seqname']].append(annotation)
+
+    return contig2annotations
+
+
+def parse_gff_file(gff_file, feature_to_keep=False):
+
+    gff_headers = ("seqname", "_3", "feature", "start", "end", "_2", "strand", "_1", "attribute")
+
+    gff_annotations = csv.DictReader(read_file_and_ignore_hashtag(gff_file),
+                                     delimiter='\t',
+                                     fieldnames=gff_headers)
+
+    if feature_to_keep:
+        logging.info(f'Keeping only {feature_to_keep} feature from {gff_file}')
+        gff_annotations = (annotation for annotation in gff_annotations if annotation['feature'] == feature_to_keep)
+
+    return gff_annotations
+
+def remove_overlapping_cds(cds_file, contig2rnas):
+
+    contig_annotations = []
+    current_contig = None
+
+    for cds in parse_gff_file(cds_file, 'CDS'):
+
+        reading_next_contig = cds['seqname'] != current_contig
+
+        if contig_annotations and reading_next_contig:
+
+            yield current_contig, contig_annotations
+            contig_annotations = []
+
+        current_contig = cds['seqname']
+
+        is_overlapping = False
+        for rna in contig2rnas[current_contig]:
+            if (int(rna['end']) < int(cds['start']) or int(rna['start']) > int(cds['end'])):
+                continue
+
+            else:  # overlap -> remove cds
+                is_overlapping = True
+                overlap = f"[{max(rna['start'], cds['start'])},{min(rna['end'], cds['end'])}]"
+                rna_info = f"{rna['feature']} [{rna['start']}, {rna['end']}]"
+                cds_info = f"CDS [{cds['start']}, {cds['end']}]"
+                logging.info(f"overlap: {cds_info} overlapping with {rna_info} at {overlap} on contig={current_contig}")
+
+        if not is_overlapping:
+            contig_annotations.append(cds)
+
+    yield current_contig, contig_annotations
+
+def merging_cds_and_rna(cds_per_contig, contig2rnas):
+
+    contig_processed = []
+    for contig, cds_features in cds_per_contig:
+        contig_processed.append(contig)
+        rna_features = contig2rnas[contig]
+        yield contig, rna_features + cds_features
+
+    # check that all RNA contigs have been processed; if not, process them
+    for contig, rnas in contig2rnas.items():
+        if contig not in contig_processed:
+            logging.info(f'{contig} has only rna annotation')
+            yield contig, rnas
+
+def add_new_ID_tag(gff_attributes, new_id):
+
+    gff_attributes_no_id = ';'.join([attr for attr in gff_attributes.split(';') if attr.split('=')[0] != 'ID'])
+    return f"ID={new_id};{gff_attributes_no_id}"
+
+
+def get_tag_value(gff_attribute, tag):
+
+    for attr in gff_attribute.split(';'):
+        if attr.split('=')[0] == tag:
+            return attr.split('=')[1]
+    return ''
+
+def writing_features_to_gff_ffn_faa(annotations_per_contig, out_gff, fna_file, out_ffn, faa_file, out_faa):
+
+    contig_fa = pyfastx.Fasta(fna_file)
+    protein_fa = pyfastx.Fasta(faa_file)
+
+    with open(out_gff, 'w') as fh_gff, open(out_ffn, 'w') as fh_ffn, open(out_faa, 'w') as fh_faa:
+
+        fh_gff.write("##gff-version 3\n")
+        for contig, features in annotations_per_contig:
+            gene_count = defaultdict(int)
+            logging.info(f"writing {contig} annotations to gff file")
+            fh_gff.write(f"##sequence-region {contig}\n")
+
+            for feature in sorted(features, key=lambda x: int(x['start'])):
+                gene_count[feature['feature']] += 1
+                new_id = f"{feature['seqname']}.{feature['feature']}_{gene_count[feature['feature']]}"
+
+                start, end = int(feature['start']), int(feature['end'])
+
+                # write faa
+                if feature['feature'] == "CDS":
+                    gff_id = get_tag_value(feature['attribute'], 'ID')
+
+                    # ID in gff from prodigal is <contig_number>_<cds_number>
+                    # seq name in faa from prodigal is <contig_name>_<cds_number>
+                    cds_number = gff_id.split('_')[-1]
+                    cds_prodigal_id = f"{contig}_{cds_number}"
+                    fh_faa.write(f">{new_id} {start}-{end}\n{protein_fa[cds_prodigal_id]}\n")
+
+                # Write gff and ffn
+                feature['attribute'] = add_new_ID_tag(feature['attribute'], new_id)
+
+                tag_name = get_tag_value(feature['attribute'], 'Name')
+                fh_gff.write('\t'.join(feature.values())+"\n")
+
+                if feature['strand'] == "+":
+                    feature_seq = contig_fa[contig][start-1:end].seq
+                else:
+                    # minus strand: reverse complement the sequence
+                    feature_seq = contig_fa[contig][start-1:end].antisense
+
+                fh_ffn.write(f">{new_id} {start}-{end} {tag_name}\n{feature_seq}\n")
+
+    # building prokka-like report for multiqc
+    report = {"organism": "NA",
+              "contigs": len(contig_fa),
+              "bases": contig_fa.size}
+    report.update(gene_count)
+
+    return report
+
+
+def main():
+
+    args = parse_arguments()
+
+    if args.verbose:
+        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
+        logging.info('Mode verbose ON')
+
+    else:
+        logging.basicConfig(format="%(levelname)s: %(message)s")
+
+    trna_file = args.trna
+    rrna_file = args.rrna
+    cds_file = args.cds
+
+    fna_file = args.contig_seq
+    faa_file = args.faa_file
+
+    out_gff = args.gff_output
+    out_ffn = args.ffn_output
+    out_faa = args.faa_output
+    out_report = args.report_output
+
+    logging.info('Parsing rRNA annotations.')
+    rrna_annotations = parse_gff_file(rrna_file, 'rRNA')
+
+    logging.info('Parsing tRNA annotations.')
+    trna_annotations = parse_gff_file(trna_file, 'tRNA')
+
+    logging.info('Grouping RNA annotations by contig.')
+    contig2rnas = group_annotations_by_contigs(trna_annotations, rrna_annotations)
+
+    logging.info('Removing CDS annotations overlapping RNA annotations.')
+    unoverlapping_cds_per_contig = remove_overlapping_cds(cds_file, contig2rnas)
+
+    logging.info('Merging CDS and RNA annotations by contig.')
+    annotation_per_contigs = merging_cds_and_rna(unoverlapping_cds_per_contig, contig2rnas)
+
+    logging.info(f'Writing CDS and RNA annotations to gff file: {out_gff}.')
+    report = writing_features_to_gff_ffn_faa(annotation_per_contigs, out_gff, fna_file, out_ffn, faa_file, out_faa)
+
+    with open(out_report, "w") as fl:
+        fl.write(''.join(f"{k}: {v}\n" for k, v in report.items()))
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
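Review note: the overlap test in remove_overlapping_cds() treats features as closed intervals, so a CDS survives only if every RNA on the same contig ends before it starts or starts after it ends. A standalone sketch of that condition, with made-up coordinates:

def overlaps(a_start, a_end, b_start, b_end):
    # Closed-interval test mirroring remove_overlapping_cds():
    # no overlap iff one feature ends before the other starts.
    return not (a_end < b_start or a_start > b_end)

# A CDS at [100, 400] against an rRNA at [350, 900]: they share [350, 400],
# so the CDS would be dropped in favour of the RNA annotation.
assert overlaps(100, 400, 350, 900)
assert not overlaps(100, 200, 300, 500)

One caveat on the script itself: the overlap test converts coordinates with int(), but the logged overlap span still calls max()/min() on the raw strings, where for example max('9', '10') returns '9'; this only affects the log message, not the filtering.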
diff --git a/bin/rename_contigs.py b/bin/rename_contigs.py
new file mode 100755
index 0000000000000000000000000000000000000000..0734c9c5e297259fa8c0df0ee03015b108280f65
--- /dev/null
+++ b/bin/rename_contigs.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+"""
+Rename assembly contigs.
+
+Rename contigs as <sample_name>_c<contig_number>
+
+:Example:
+rename_contigs.py -h
+"""
+
+# Metadata
+__author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
+__copyright__ = 'Copyright (C) 2020 INRAE'
+__license__ = 'GNU General Public License'
+__version__ = '0.1'
+__email__ = 'support.bioinfo.genotoul@inra.fr'
+__status__ = 'dev'
+
+
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType
+import logging
+import gzip
+import csv
+from collections import defaultdict
+import pyfastx
+from Bio.Seq import Seq
+
+
+def parse_arguments():
+    """Parse script arguments."""
+    parser = ArgumentParser(description="...",
+                            formatter_class=ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('-s', '--sample', help='Sample name used to rename contigs.', required=True)
+
+    parser.add_argument('-i', '--fna_file', help='Original fasta file of contigs.', required=True)
+
+    parser.add_argument('-o', '--out_fna', help='Output fasta file with renamed contigs.', required=True)
+
+    parser.add_argument("-v", "--verbose", help="increase output verbosity",
+                        action="store_true")
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+
+    args = parse_arguments()
+
+    if args.verbose:
+        logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.DEBUG)
+        logging.info('Mode verbose ON')
+
+    else:
+        logging.basicConfig(format="%(levelname)s: %(message)s")
+
+    sample = args.sample
+    fna_file = args.fna_file
+    out_fna = args.out_fna
+
+    logging.info(f'Writing renamed fasta file in {out_fna}')
+    with open(out_fna, "w") as fh_fna:
+        for i, (_, seq) in enumerate(pyfastx.Fasta(fna_file, build_index=False)):
+            fh_fna.write(f'>{sample}_c{i+1}\n{seq}\n')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index e04c4d3fa7789ae889bc21828f956e77be2af0a0..42794741c9302e0e76893f21c4335a39d25c890d 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -23,7 +23,6 @@ regexes = {
     'Hifiasm': ['v_hifiasm_meta.txt', r"ha base version: (\S+)"],
     'MetaFlye': ['v_metaflye.txt', r"v(\S+)"],
     'Quast': ['v_quast.txt', r"QUAST v(\S+)"],
-    'Prokka': ['v_prokka.txt', r"prokka (\S+)"],
     'Kaiju': ['v_kaiju.txt', r"Kaiju (\S+)"],
     'Samtools': ['v_samtools.txt', r"samtools (\S+)"],
    'Eggnog-Mapper': ['v_eggnogmapper.txt', r"emapper-(\S+)"],
@@ -33,7 +32,10 @@ regexes = {
     'CheckM2': ['v_checkm2.txt', r"(\S+)"],
     'Metawrap': ['v_metawrap.txt', r"metawrap (\S+)"],
     'GTDBTK': ['v_gtdbtk.txt', r"...::: GTDB-Tk v(\S+)"],
-    'dRep': ['v_dRep.txt', r"version (\S+)"]
+    'dRep': ['v_dRep.txt', r"version (\S+)"],
+    'tRNAscan-SE': ['v_tRNAscan.txt', r"tRNAscan-SE (\S+)"],
+    'Barrnap': ['v_barrnap.txt', r"barrnap (\S+)"],
+    'Prodigal': ['v_prodigal.txt', r"Prodigal (\S+):"],
 }
 results = OrderedDict()
 results['metagWGS'] = '<span style="color:#999999;\">N/A</span>'
@@ -65,6 +67,9 @@ results['Metawrap'] = '<span style="color:#999999;\">N/A</span>'
 results['dRep'] = '<span style="color:#999999;\">N/A</span>'
 results['GTDBTK'] = '<span style="color:#999999;\">N/A</span>'
 results['MultiQC'] = '<span style="color:#999999;\">N/A</span>'
+results['tRNAscan-SE'] = '<span style="color:#999999;\">N/A</span>'
+results['Barrnap'] = '<span style="color:#999999;\">N/A</span>'
+results['Prodigal'] = '<span style="color:#999999;\">N/A</span>'
 
 # Search each file using its regex
 for k, v in regexes.items():
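Each regexes entry pairs a version file with a capture pattern. A sketch of how the new Prodigal pattern extracts the version number (the file content shown is an assumption of the usual `prodigal -v` banner):

import re

# Assumed content of v_prodigal.txt, written by `prodigal -v 2> v_prodigal.txt`.
text = "Prodigal V2.6.3: February, 2016"

match = re.search(r"Prodigal (\S+):", text)
if match:
    print(match.group(1))  # V2.6.3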
diff --git a/conf/base.config b/conf/base.config
index 607ae05250e35b438b10db8996b40eff575f9d9e..721713b52e7f984bcfc58969b1784f33f79b15b8 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -55,15 +55,8 @@ process {
         memory = { 32.GB * task.attempt }
     }
     withLabel: ASSEMBLY_FILTER {
-        memory = { 8.GB * task.attempt }
-        cpus = 4
-    }
-    withName: PROKKA {
-        memory = { 45.GB * task.attempt }
-        cpus = 8
-    }
-    withName: RENAME_CONTIGS_AND_GENES {
-        memory = { 20.GB * task.attempt }
+        memory = { 2.GB * task.attempt }
+        cpus = 1
     }
     withLabel: CD_HIT {
         memory = { 50.GB * task.attempt }
diff --git a/conf/test.config b/conf/test.config
index 710b959e18f9fff612ec836a1f6d90bbccd1ded3..8b437cdfc845c560673445737439168cbc359b64 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -46,16 +46,9 @@ process {
         memory = { 1.GB * task.attempt }
     }
     withLabel: ASSEMBLY_FILTER {
-        memory = { 1.GB * task.attempt }
-        cpus = 2
-    }
-    withName: PROKKA {
         memory = { 1.GB * task.attempt }
         cpus = 1
     }
-    withName: RENAME_CONTIGS_AND_GENES {
-        memory = { 1.GB * task.attempt }
-    }
     withLabel: CD_HIT {
         memory = { 16.GB * task.attempt }
         cpus = 2
diff --git a/docs/usage.md b/docs/usage.md
index 672fa3a91602e436949b36080a577090c67ab788..d0b1a4c0c74d5ee76787982dd29c8ab8af7f3633 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -300,7 +300,7 @@ No parameter available for this substep.
 No parameters.
 
 **WARNING 6:** `S04_STRUCTURAL_ANNOT` step depends on `S01_CLEAN_QC`, `S02_ASSEMBLY` and `S03_FILTERING` steps (if you use it). You need to use the mandatory files of these four steps to run `S04_STRUCTURAL_ANNOT`. See [II. Input files](usage.md#ii-input-files) and **WARNINGS 1 to 6**.
-
+<!--
 **WARNING 7:** if you haven't previously done `S03_FILTERING`, calculation time of `S04_STRUCTURAL_ANNOT` can be important. Some cluster queues have defined calculation time, you need to adapt the queue you use to your data.
 > For example, if you are on [genologin cluster](http://bioinfo.genotoul.fr/) and you haven't done the `S03_FILTERING` step, you can write a `nextflow.config` file in your working directory containing these lines:
 > ```bash
@@ -308,7 +308,7 @@ No parameters.
 >   queue = 'unlimitq'
 > }
 > ```
-> This will launch the `Prokka` command line of step `04_STRUCTURAL_ANNOT` on a calculation queue (`unlimitq`) where the job can last more than 4 days (which is not the case for the usual `workq` queue).
+> This will launch the `Prokka` command line of step `04_STRUCTURAL_ANNOT` on a calculation queue (`unlimitq`) where the job can last more than 4 days (which is not the case for the usual `workq` queue). -->
 
 #### **`S05_ALIGNMENT` step:**
diff --git a/env/metagWGS.yml b/env/metagWGS.yml
index 8a0cc9e45be7283620d558c12c9510dd76dfac94..6a0b710fcdb61b986cda0081ea4a77595ea0af73 100644
--- a/env/metagWGS.yml
+++ b/env/metagWGS.yml
@@ -28,5 +28,5 @@ dependencies:
   - flye=2.9.1
   - hifiasm_meta=hamtv0.3
   - plotly
+  - trnascan-se=2.0.11
   - pyfastx
-
diff --git a/main.nf b/main.nf
index 229345d0296d2c45815b0b32559b36fd138d96ab..cdd2d19cc94c965ad40b3d6af8b3cf1f68c4be9e 100644
--- a/main.nf
+++ b/main.nf
@@ -287,7 +287,7 @@ workflow {
   ch_kaiju_db = Channel.empty()
   ch_eggnog_db = Channel.empty()
   ch_taxonomy = Channel.empty()
-  ch_diamond = Channel.empty()
+  ch_diamond_db = Channel.empty()
   ch_gtbdtk_db = Channel.empty()
 
@@ -297,7 +297,7 @@ workflow {
     ch_kaiju_db = DATABASES.out.kaiju_db
     ch_eggnog_db = DATABASES.out.eggnog
     ch_taxonomy = DATABASES.out.taxonomy
-    ch_diamond = DATABASES.out.diamond
+    ch_diamond_db = DATABASES.out.diamond
     ch_gtbdtk_db = DATABASES.out.gtdbtk
 
   ch_multiqc_config = Channel.empty()
@@ -317,7 +317,7 @@ workflow {
   ch_final_assembly_flagstat_report = Channel.empty()
   ch_assembly_report = Channel.empty()
   ch_filtered_report = Channel.empty()
-  ch_prokka_report = Channel.empty()
+  ch_annotation_report = Channel.empty()
   ch_bins_abundances_report = Channel.empty()
   ch_bins_stats_report = Channel.empty()
   ch_quast = Channel.empty()
@@ -404,92 +404,82 @@ workflow {
     /////////////////////
     S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool )
+
     ch_assembly = S02_ASSEMBLY.out.assembly
-    ch_reads = S02_ASSEMBLY.out.dedup
+    ch_reads = S02_ASSEMBLY.out.reads
+    ch_bam = S02_ASSEMBLY.out.bam
+
+    ch_sam_coverage = S02_ASSEMBLY.out.coverage
     ch_idxstats = S02_ASSEMBLY.out.idxstats
     ch_unfilter_assembly_flagstat_report = S02_ASSEMBLY.out.flagstat
-    ch_quast = S02_ASSEMBLY.out.assembly_report
-    ch_quast_before_filter_report = ch_quast.map{ meta, quast_report -> quast_report }
+
+    ch_quast_before_filter_report = S02_ASSEMBLY.out.assembly_report
     ch_software_versions_S02 = S02_ASSEMBLY.out.software_versions
   }
 
  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.skip_filtering ) {
+
     ch_min_contigs_cpm = Channel.value(params.min_contigs_cpm)
-    ch_assembly
-      .splitFasta(by: 100000, file: true)
-      .set{ch_chunk_assembly_for_filter}
-    ch_chunk_assembly_for_filter
-      .combine(ch_idxstats, by:0)
-      .set{ch_assembly_and_idxstats}
     S03_FILTERING (
-      ch_assembly_and_idxstats,
+      ch_assembly,
+      ch_idxstats,
+      ch_bam,
      ch_min_contigs_cpm
    )
-    ch_assembly = S03_FILTERING.out.selected
-    ch_quast = S03_FILTERING.out.report
-    ch_quast_after_filter_report = ch_quast.map{ meta, quast_report -> quast_report }
+    ch_assembly = S03_FILTERING.out.selected_contigs
+    ch_bam = S03_FILTERING.out.bam
+
+    ch_sam_coverage = S03_FILTERING.out.sam_coverage
+    ch_quast_after_filter_report = S03_FILTERING.out.quast_report
+    ch_final_assembly_flagstat_report = S03_FILTERING.out.sam_flagstat
+
  }
 
-  ch_contigs_and_reads = Channel.empty()
-  ch_prokka_ffn = Channel.empty()
-  ch_prokka_faa = Channel.empty()
-  ch_prokka_gff = Channel.empty()
-  ch_prokka_fna = Channel.empty()
-  ch_prot_length = Channel.empty()
+  ch_annotation_ffn = Channel.empty()
+  ch_annotation_faa = Channel.empty()
+  ch_annotation_gff = Channel.empty()
 
  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering ) {
     S04_STRUCTURAL_ANNOT ( ch_assembly )
-    ch_prokka_ffn = S04_STRUCTURAL_ANNOT.out.ffn
-    ch_prokka_faa = S04_STRUCTURAL_ANNOT.out.faa
-    ch_prokka_gff = S04_STRUCTURAL_ANNOT.out.gff
-    ch_prokka_fna = S04_STRUCTURAL_ANNOT.out.fna
-    ch_prokka_report = S04_STRUCTURAL_ANNOT.out.report
-
-    ch_contigs_and_reads = ch_prokka_fna
-      .join(ch_reads, remainder: true)
-    ch_prot_length = S04_STRUCTURAL_ANNOT.out.prot_length
+    ch_annotation_ffn = S04_STRUCTURAL_ANNOT.out.ffn
+    ch_annotation_faa = S04_STRUCTURAL_ANNOT.out.faa
+    ch_annotation_gff = S04_STRUCTURAL_ANNOT.out.gff
+    ch_annotation_report = S04_STRUCTURAL_ANNOT.out.report
 
     ch_software_versions_S04 = S04_STRUCTURAL_ANNOT.out.software_versions
  }
 
-  ch_bam = Channel.empty()
-  ch_m8 = Channel.empty()
-  ch_sam_coverage = Channel.empty()
+  ch_diamond_result = Channel.empty()
 
  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot ) {
-    S05_ALIGNMENT ( ch_contigs_and_reads, ch_prokka_faa, ch_diamond)
-    ch_bam = S05_ALIGNMENT.out.bam
-    ch_m8 = S05_ALIGNMENT.out.m8
-    ch_sam_coverage = S05_ALIGNMENT.out.sam_coverage
-
-    if (!params.skip_filtering){
-      // when filtering is skip reads vs assembly remain unchanged so no need to send it to multiqc
-      ch_final_assembly_flagstat_report = S05_ALIGNMENT.out.sam_flagstat
-    }
+    S05_ALIGNMENT (ch_annotation_faa, ch_diamond_db)
+
+    ch_diamond_result = S05_ALIGNMENT.out.m8
+
     ch_software_versions_S05 = S05_ALIGNMENT.out.software_versions
  }
 
  ch_quant_report = Channel.empty()
  ch_v_eggnogmapper = Channel.empty()
 
  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot ) {
-    S06_FUNC_ANNOT ( ch_prokka_ffn, ch_prokka_faa, ch_prokka_gff, ch_bam, ch_m8, ch_eggnog_db )
+    S06_FUNC_ANNOT ( ch_annotation_ffn, ch_annotation_faa, ch_annotation_gff, ch_bam, ch_diamond_result, ch_eggnog_db )
     ch_quant_report = S06_FUNC_ANNOT.out.quant_report
     ch_software_versions_S06 = S06_FUNC_ANNOT.out.software_versions
  }
 
  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi ) {
-    S07_TAXO_AFFI ( ch_taxonomy, ch_m8, ch_sam_coverage, ch_prot_length)
+    S07_TAXO_AFFI ( ch_taxonomy, ch_diamond_result, ch_sam_coverage)
     ch_software_versions_S07 = S07_TAXO_AFFI.out.software_versions
  }
 
  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_binning ) {
-    S08_BINNING( ch_reads, ch_prokka_fna, ch_bam, ch_gtbdtk_db, ch_quast )
+    S08_BINNING( ch_reads, ch_assembly, ch_bam, ch_gtbdtk_db, ch_quast )
     ch_bins_abundances_report = S08_BINNING.out.bins_abundances_report
     ch_bins_stats_report = S08_BINNING.out.bins_stats_report
@@ -518,7 +508,7 @@ workflow {
       ch_quast_before_filter_report.collect().ifEmpty([]),
       ch_quast_after_filter_report.collect().ifEmpty([]),
       ch_final_assembly_flagstat_report.collect().ifEmpty([]),
-      ch_prokka_report.collect().ifEmpty([]),
+      ch_annotation_report.collect().ifEmpty([]),
       ch_quant_report.collect().ifEmpty([]),
       ch_bins_abundances_report.collect().ifEmpty([]),
       ch_bins_stats_report.collect().ifEmpty([])
b8f9d358356e83bc05e93457c96efa5674ffffb1..91552700071535ce00f8ff4f46438b31cd8d1247 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -107,3 +107,20 @@ process METAFLYE { """ } +process RENAME_CONTIGS { + tag "${meta.id}" + publishDir "${params.outdir}/02_assembly/", mode: 'copy' + label 'PYTHON' + + input: + tuple val(meta), path("${meta.id}.raw.fna") + + output: + tuple val(meta), path("${meta.id}.fna"), emit: fna + + script: + """ + rename_contigs.py --sample ${meta.id} --fna_file ${meta.id}.raw.fna --out_fna ${meta.id}.fna -v + + """ +} \ No newline at end of file diff --git a/modules/assign_taxonomy.nf b/modules/assign_taxonomy.nf index 9d765ad9b0d1d2634c3b457714ca787f9ce39a77..559c06cbb7276a222a0ee53f1476a86450e74216 100644 --- a/modules/assign_taxonomy.nf +++ b/modules/assign_taxonomy.nf @@ -5,13 +5,12 @@ process ASSIGN_TAXONOMY { input: tuple path(accession2taxid), path(new_taxdump) - tuple val(meta), path(m8), path(sam_coverage), path(prot_len) + tuple val(meta), path(m8), path(sam_coverage) output: tuple val(meta.id), path("${meta.id}.percontig.tsv"), emit: t_percontig tuple val(meta.id), path("${meta.id}.pergene.tsv"), emit: t_pergene tuple val(meta.id), path("${meta.id}.warn.tsv"), emit: t_warn - tuple val(meta.id), path("graphs"), emit: t_graphs path "${meta.id}_quantif_percontig.tsv", emit: q_all path "${meta.id}_quantif_percontig_by_superkingdom.tsv", emit: q_superkingdom path "${meta.id}_quantif_percontig_by_phylum.tsv", emit: q_phylum @@ -41,7 +40,7 @@ process ASSIGN_TAXONOMY { aln2taxaffi.py -a ${accession2taxid} --taxonomy \$new_taxdump_var \ -o ${meta.id} -b ${m8} --keep_only_best_aln \ - --query_length_file ${prot_len} -v --write_top_taxons + -v --write_top_taxons merge_contig_quantif_perlineage.py -c ${meta.id}.percontig.tsv -s ${sam_coverage} -o ${meta.id} new_taxdump_original=$new_taxdump diff --git a/modules/barrnap.nf b/modules/barrnap.nf new file mode 100644 index 0000000000000000000000000000000000000000..16013a09c886a200ca87ba970420ec2c1ec3fd72 --- /dev/null +++ b/modules/barrnap.nf @@ -0,0 +1,18 @@ +process BARRNAP { + tag "${meta.id}" + + input: + tuple val(meta), file(assembly_file) + + output: + tuple val(meta), path("barrnap.gff"), emit: gff + path "v_barrnap.txt", emit: v_barrnap + + script: + """ + barrnap --threads ${task.cpus} ${assembly_file} > barrnap.gff + + barrnap --version 2> v_barrnap.txt + + """ +} diff --git a/modules/filtering_cpm.nf b/modules/filtering_cpm.nf index 951a2018db7e7aed1f5c0915a8e54e68f7595c6f..cdfe8e52bd7a719851503a64284724c4c5efac64 100644 --- a/modules/filtering_cpm.nf +++ b/modules/filtering_cpm.nf @@ -12,7 +12,7 @@ process CHUNK_ASSEMBLY_FILTER { script: chunk_name = assembly_file.baseName """ - Filter_contig_per_cpm.py -i ${idxstats} -f ${assembly_file} -c ${min_cpm} -s ${chunk_name}_select_cpm${min_cpm}.fasta -d ${chunk_name}_discard_cpm${min_cpm}.fasta + filter_contig_per_cpm.py -v -i ${idxstats} -f ${assembly_file} -c ${min_cpm} -s ${chunk_name}_select_cpm${min_cpm}.fasta -d ${chunk_name}_discard_cpm${min_cpm}.fasta """ } diff --git a/modules/merge_annotations.nf b/modules/merge_annotations.nf new file mode 100644 index 0000000000000000000000000000000000000000..c6481f640067f8a93a9e295b653a9ec9733b4faf --- /dev/null +++ b/modules/merge_annotations.nf @@ -0,0 +1,21 @@ +process MERGE_ANNOTATIONS { + publishDir "${params.outdir}/04_structural_annot/${meta.id}/", mode: 'copy' + tag "${meta.id}" + + input: + tuple val(meta), file(assembly_file), file(faa_file), file(cds_gff), file(rrna_gff), file(trna_gff) + + 
output: + tuple val(meta), file("${meta.id}.gff"), emit: gff + tuple val(meta), file("${meta.id}.ffn"), emit: ffn + tuple val(meta), file("${meta.id}.faa"), emit: faa + path "${meta.id}.txt", emit: report + + script: + """ + merge_annotations.py -c $cds_gff -r $rrna_gff -t $trna_gff -v \ + --contig_seq $assembly_file --faa_file $faa_file \ + --ffn_output ${meta.id}.ffn --gff_output ${meta.id}.gff --faa_output ${meta.id}.faa \ + --report_output ${meta.id}.txt + """ +} \ No newline at end of file diff --git a/modules/metaquast.nf b/modules/metaquast.nf index dbaa508907511d05a022c258e8ff9b6ffdde63da..dee90d58aa2e646adb54b24fece1a54f3a5be520 100644 --- a/modules/metaquast.nf +++ b/modules/metaquast.nf @@ -9,7 +9,7 @@ process QUAST { output: path "${outdir}/${meta.id}/*", emit: all - tuple val(meta), path ("${outdir}/${meta.id}/report.tsv"), emit: report + path "${outdir}/${meta.id}/report.tsv", emit: report path "v_quast.txt", emit: v_quast script: diff --git a/modules/prodigal.nf b/modules/prodigal.nf new file mode 100644 index 0000000000000000000000000000000000000000..63bde69805b4f69bd03d76bfe1cf3a42a381c186 --- /dev/null +++ b/modules/prodigal.nf @@ -0,0 +1,19 @@ +process PRODIGAL { + tag "${meta.id}" + + input: + tuple val(meta), file(assembly_file) + + output: + tuple val(meta), path("prodigal.gff"), emit: gff + tuple val(meta), path("prodigal.faa"), emit: faa + path "v_prodigal.txt", emit: v_prodigal + + script: + """ + prodigal -i ${assembly_file} -c -p meta -f gff -a prodigal.faa -o prodigal.gff + + prodigal -v 2> v_prodigal.txt + + """ +} \ No newline at end of file diff --git a/modules/prokka.nf b/modules/prokka.nf deleted file mode 100644 index 6fbd538686fe545b4d1de3d08011a3a8dd8189ea..0000000000000000000000000000000000000000 --- a/modules/prokka.nf +++ /dev/null @@ -1,48 +0,0 @@ -process PROKKA { - tag "${meta.id}" - - input: - tuple val(meta), file(assembly_file) - - output: - tuple val(meta), path("PROKKA_${meta.id}"), emit: prokka_results - path "PROKKA_${meta.id}/${meta.id}.txt", emit: report - path "v_prokka.txt", emit: v_prokka - - script: - """ - prokka --metagenome --noanno --rawproduct --outdir PROKKA_${meta.id} --prefix ${meta.id} ${assembly_file} --centre X --compliant --cpus ${task.cpus} - rm PROKKA_${meta.id}/*.gbk - gt gff3validator PROKKA_${meta.id}/${meta.id}.gff - - prokka -v &> v_prokka.txt - """ -} - -process RENAME_CONTIGS_AND_GENES { - tag "${meta.id}" - publishDir "${params.outdir}/04_structural_annot", mode: 'copy' - label 'PYTHON' - - input: - tuple val(meta), path(prokka_results) - - output: - tuple val(meta), path("${meta.id}.annotated.fna"), emit: fna - tuple val(meta), path("${meta.id}.annotated.ffn"), emit: ffn - tuple val(meta), path("${meta.id}.annotated.faa"), emit: faa - tuple val(meta), path("${meta.id}.annotated.gff"), emit: gff - tuple val(meta), path("${meta.id}_prot.len"), emit: prot_length - - script: - """ - grep "^gnl" ${prokka_results}/${meta.id}.gff > ${meta.id}_only_gnl.gff - - Rename_contigs_and_genes.py -f ${meta.id}_only_gnl.gff -faa ${prokka_results}/${meta.id}.faa \ - -ffn ${prokka_results}/${meta.id}.ffn -fna ${prokka_results}/${meta.id}.fna \ - -p ${meta.id} -oGFF ${meta.id}.annotated.gff -oFAA ${meta.id}.annotated.faa \ - -oFFN ${meta.id}.annotated.ffn -oFNA ${meta.id}.annotated.fna - - samtools faidx ${meta.id}.annotated.faa; cut -f 1,2 ${meta.id}.annotated.faa.fai > ${meta.id}_prot.len - """ -} diff --git a/modules/read_alignment_manipulation.nf b/modules/read_alignment_manipulation.nf new file mode 100644 index 
0000000000000000000000000000000000000000..ca0fe5dd3abef86bfc16183aa7a33d113b6aa377 --- /dev/null +++ b/modules/read_alignment_manipulation.nf @@ -0,0 +1,104 @@ +process GET_ALIGNMENT_METRICS { + tag "${meta.id}" + publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy' + + input: + tuple val(meta), path(bam), path(bai) + val(publishDir_path) + + output: + tuple val(meta), path("${meta.id}.coverage.tsv"), emit: sam_coverage + tuple val(meta), path("${meta.id}.idxstats"), emit: sam_idxstat + path "${meta.id}.flagstat", emit: sam_flagstat + path "${meta.id}*" + path "v_samtools.txt", emit: v_samtools + + script: + """ + samtools flagstat -@ ${task.cpus} ${bam} > ${meta.id}.flagstat + samtools coverage ${bam} > ${meta.id}.coverage.tsv + samtools idxstats ${bam} > ${meta.id}.idxstats + + samtools --version &> v_samtools.txt + """ +} + + +process FILTER_BAM { + tag "${meta.id}" + + input: + tuple val(meta), path(discarded_contigs), path(bam), path(bai) + + output: + tuple val(meta), path("selected_contigs.bam"), emit: selected_contigs_bam + tuple val(meta), path("discarded_contigs.bam"), emit: discarded_contigs_bam + + + script: + """ + + samtools faidx $discarded_contigs + awk 'BEGIN {FS="\\t"}; {print \$1 FS "0" FS \$2}' ${discarded_contigs}.fai > discarded_contigs.bed + + + samtools view -@ ${task.cpus} -bL discarded_contigs.bed -U selected_contigs.bam $bam -o discarded_contigs.bam + + + """ +} + +process EXTRACT_PAIRED_READS { + tag "${meta.id}" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("reads*.fastq.gz"), emit: reads + + + script: + """ + + samtools fastq -@ ${task.cpus} -N -1 reads_R1.fastq.gz -2 reads_R2.fastq.gz $bam + + """ +} + +process EXTRACT_SINGLE_READS { + tag "${meta.id}" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("reads.fastq.gz"), emit: reads + + script: + """ + + samtools fastq -@ ${task.cpus} -o reads.fastq.gz $bam + + """ +} + + + +process MERGE_BAM_FILES { + tag "${meta.id}" + + input: + tuple val(meta), path(bam1), path(bam2) + + output: + tuple val(meta), path("${meta.id}.filtered.bam"), path("${meta.id}.filtered.bam.bai") , emit: bam + + script: + """ + samtools merge -@ ${task.cpus} -o ${meta.id}.filtered.bam $bam1 $bam2 + samtools index -@ ${task.cpus} ${meta.id}.filtered.bam + + """ +} + diff --git a/modules/read_alignment_metrics.nf b/modules/read_alignment_metrics.nf deleted file mode 100644 index e23dc78beff36d661c7a5c9b3a250856b69e5013..0000000000000000000000000000000000000000 --- a/modules/read_alignment_metrics.nf +++ /dev/null @@ -1,22 +0,0 @@ -process SAMTOOLS { - tag "${meta.id}" - publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy' - - input: - tuple val(meta), path(bam), path(bai) - val(publishDir_path) - - output: - tuple val(meta), path("${meta.id}.coverage.tsv"), emit: sam_coverage - tuple val(meta), path("${meta.id}.idxstats"), emit: sam_idxstat - path "${meta.id}.flagstat", emit: sam_flagstat - path "${meta.id}*" - - - script: - """ - samtools flagstat -@ ${task.cpus} ${bam} > ${meta.id}.flagstat - samtools coverage ${bam} > ${meta.id}.coverage.tsv - samtools idxstats ${bam} > ${meta.id}.idxstats - """ -} \ No newline at end of file diff --git a/modules/reads_deduplication.nf b/modules/reads_deduplication.nf index b64df7ca1edbf142aa2875281915a66f0fcdad64..bce077d8e6ba3dd99ede048e1822a95745440818 100644 --- a/modules/reads_deduplication.nf +++ b/modules/reads_deduplication.nf @@ -1,38 +1,46 @@ process READS_DEDUPLICATION { tag "${meta.id}" - publishDir 
"${params.outdir}/02_assembly", mode: 'copy', pattern: '*.fastq.gz' - publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.idxstats' - publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.flagstat' + publishDir "${params.outdir}/02_assembly/deduplicated_reads/", mode: 'copy', pattern: '*.fastq.gz' + publishDir "${params.outdir}/02_assembly/${meta.id}/", mode: 'copy', pattern: '*.fastq.gz' + publishDir "${params.outdir}/02_assembly/${meta.id}/", mode: 'copy', pattern: '*.bam*' input: tuple val(meta), path(assembly), path(reads) output: - tuple val(meta), path("${meta.id}*_dedup.fastq.gz"), emit: dedup - tuple val(meta), path("${meta.id}.count_reads_on_contigs.idxstats"), emit: idxstats + tuple val(meta), path("${meta.id}*_dedup.fastq.gz"), emit: dedup_reads + tuple val(meta), path("${meta.id}.bam"), path("${meta.id}.bam.bai"), emit: bam + path "${meta.id}_singletons.fastq.gz", emit: singletons - path "${meta.id}.count_reads_on_contigs.flagstat", emit: flagstat + path "v_bwa.txt", emit: v_bwa_mem2 path "v_samtools.txt", emit: v_samtools script: """ - mkdir logs + # Align reads against assembly bwa-mem2 index ${assembly} -p ${assembly} - bwa-mem2 mem ${assembly} ${reads[0]} ${reads[1]} | samtools view -bS - | samtools sort -n -o ${meta.id}.sort.bam - - samtools fixmate -m ${meta.id}.sort.bam ${meta.id}.fixmate.bam - samtools sort -o ${meta.id}.fixmate.positionsort.bam ${meta.id}.fixmate.bam - samtools markdup -r -S -s -f ${meta.id}.stats ${meta.id}.fixmate.positionsort.bam ${meta.id}.filtered.bam - samtools index ${meta.id}.filtered.bam - samtools idxstats ${meta.id}.filtered.bam > ${meta.id}.count_reads_on_contigs.idxstats - samtools flagstat ${meta.id}.filtered.bam > ${meta.id}.count_reads_on_contigs.flagstat - samtools sort -n -o ${meta.id}.filtered.sort.bam ${meta.id}.filtered.bam - samtools fastq -N -s ${meta.id}_singletons.fastq.gz -1 ${meta.id}_R1_dedup.fastq.gz -2 ${meta.id}_R2_dedup.fastq.gz ${meta.id}.filtered.sort.bam + bwa-mem2 mem -t ${task.cpus} ${assembly} ${reads[0]} ${reads[1]} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} -n -o ${meta.id}.sort.bam - + + # Identify and removed duplicated reads from the bam file + samtools fixmate -@ ${task.cpus} -m ${meta.id}.sort.bam ${meta.id}.fixmate.bam + samtools sort -@ ${task.cpus} -o ${meta.id}.fixmate.positionsort.bam ${meta.id}.fixmate.bam + samtools markdup -@ ${task.cpus} -r -S -s -f ${meta.id}.stats ${meta.id}.fixmate.positionsort.bam ${meta.id}.filtered.bam + + # final bam file without duplicated reads + samtools sort -@ ${task.cpus} -o ${meta.id}.bam ${meta.id}.filtered.bam + samtools index -@ ${task.cpus} ${meta.id}.bam + + # Get deduplicated reads + samtools sort -@ ${task.cpus} -n -o ${meta.id}.filtered.n_sorted.bam ${meta.id}.bam + samtools fastq -@ ${task.cpus} -N -s ${meta.id}_singletons.fastq.gz -1 ${meta.id}_R1_dedup.fastq.gz -2 ${meta.id}_R2_dedup.fastq.gz ${meta.id}.filtered.n_sorted.bam + + # clean directory rm ${meta.id}.sort.bam rm ${meta.id}.fixmate.bam rm ${meta.id}.fixmate.positionsort.bam - rm ${meta.id}.filtered.bam - rm ${meta.id}.filtered.sort.bam + rm ${meta.id}.filtered.n_sorted.bam + rm ${meta.id}.filtered.bam bwa-mem2 version > v_bwa.txt diff --git a/modules/trnascan_se.nf b/modules/trnascan_se.nf new file mode 100644 index 0000000000000000000000000000000000000000..b690c7c42a1bbe808d6e63c4e3136a747bd02b1b --- /dev/null +++ b/modules/trnascan_se.nf @@ -0,0 +1,19 @@ +process TRNASCAN_SE { + tag "${meta.id}" + + input: + tuple val(meta), 
diff --git a/modules/trnascan_se.nf b/modules/trnascan_se.nf
new file mode 100644
index 0000000000000000000000000000000000000000..b690c7c42a1bbe808d6e63c4e3136a747bd02b1b
--- /dev/null
+++ b/modules/trnascan_se.nf
@@ -0,0 +1,19 @@
+process TRNASCAN_SE {
+    tag "${meta.id}"
+
+    input:
+    tuple val(meta), file(assembly_file)
+
+    output:
+    tuple val(meta), path("trnascan_se.gff"), emit: gff
+    path "v_tRNAscan.txt", emit: v_tRNAscan
+
+    script:
+    """
+    # -B: bacterial tRNA model; GFF output plus run statistics
+    tRNAscan-SE -B --gff trnascan_se.gff --thread ${task.cpus} --stats trnascan_se.log ${assembly_file}
+
+    tRNAscan-SE -h 2> v_tRNAscan.txt
+    """
+}
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index 84117284a987587fbd6741a457ddd7f3a662b4c0..fe90d448a0c7d68b5b231f95cadca428f5a889de 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -1,15 +1,15 @@
-include { METASPADES; MEGAHIT; HIFIASM_META; METAFLYE } from '../modules/assembly'
+include { METASPADES; MEGAHIT; HIFIASM_META; METAFLYE; RENAME_CONTIGS } from '../modules/assembly'
 include { QUAST as ASSEMBLY_QUAST} from '../modules/metaquast'
 include { READS_DEDUPLICATION } from '../modules/reads_deduplication'
 include { MINIMAP2 } from '../modules/read_alignment'
-include { SAMTOOLS } from '../modules/read_alignment_metrics'
+include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation'
 
 workflow STEP_02_ASSEMBLY {
   take:
-    reads
-    assembly
-    has_assembly
-    assembly_tool
+        reads
+        assembly
+        has_assembly
+        assembly_tool
 
   main:
     ch_assembler_v = Channel.empty()
@@ -47,40 +47,40 @@
         }
     }
 
-    ASSEMBLY_QUAST( ch_assembly,"02_assembly/quast_primary")
+    ch_assembly_renamed = RENAME_CONTIGS(ch_assembly)
+    ASSEMBLY_QUAST( ch_assembly_renamed,"02_assembly/quast_primary")
     ch_assembly_report = ASSEMBLY_QUAST.out.report
     ch_quast_v = ASSEMBLY_QUAST.out.v_quast
 
-    ch_dedup = Channel.empty()
     ch_idxstats = Channel.empty()
-    ch_dedup_report = Channel.empty()
     ch_flagstat = Channel.empty()
+    ch_bam = Channel.empty()
     ch_reads = reads
 
-    ch_contigs_and_reads = ch_assembly.join(ch_reads, remainder: true)
+    ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true)
 
     if ( params.type.toUpperCase() == "SR" ) {
+
         READS_DEDUPLICATION ( ch_contigs_and_reads )
-        ch_dedup = READS_DEDUPLICATION.out.dedup
-        ch_idxstats = READS_DEDUPLICATION.out.idxstats
-        ch_flagstat = READS_DEDUPLICATION.out.flagstat
+
+        ch_reads = READS_DEDUPLICATION.out.dedup_reads
+        ch_bam = READS_DEDUPLICATION.out.bam
         ch_bwa_mem_v = READS_DEDUPLICATION.out.v_bwa_mem2
-        ch_samtools_v = READS_DEDUPLICATION.out.v_samtools
+
     } else {
-        ch_dedup = ch_reads
-
         if (!params.skip_filtering) {
             MINIMAP2(ch_contigs_and_reads,"02_assembly")
             ch_bam = MINIMAP2.out.bam
-            SAMTOOLS(ch_bam, "02_assembly")
-            ch_idxstats = SAMTOOLS.out.sam_idxstat
-            ch_flagstat = SAMTOOLS.out.sam_flagstat
             ch_minimap2_v = MINIMAP2.out.v_minimap2
-            ch_samtools_v = MINIMAP2.out.v_samtools
-        }
-
+        }
     }
+
+    GET_ALIGNMENT_METRICS(ch_bam, "02_assembly")
+
+    ch_idxstats = GET_ALIGNMENT_METRICS.out.sam_idxstat
+    ch_flagstat = GET_ALIGNMENT_METRICS.out.sam_flagstat
+    ch_coverage = GET_ALIGNMENT_METRICS.out.sam_coverage
+    ch_samtools_v = GET_ALIGNMENT_METRICS.out.v_samtools
+
     ch_software_versions = ch_assembler_v.first().mix(
                                 ch_quast_v.first(),
                                 ch_bwa_mem_v.first(),
                                 ch_minimap2_v.first(),
@@ -88,10 +88,12 @@
 
   emit:
-    assembly = ch_assembly
-    dedup = ch_dedup
+    assembly = ch_assembly_renamed
+    reads = ch_reads
+    bam = ch_bam
     idxstats = ch_idxstats
     flagstat = ch_flagstat
+    coverage = ch_coverage
     assembly_report = ch_assembly_report
     software_versions = ch_software_versions
 }
\ No newline at end of file
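Both branches now feed a single GET_ALIGNMENT_METRICS call; ch_bam is initialised to Channel.empty() alongside the other channels so the HiFi branch cannot leave it undefined when --skip_filtering is set. Condensed, the pattern looks like this, using the channel names from the hunk above:

    ch_bam = Channel.empty()
    if( params.type.toUpperCase() == 'SR' ) {
        READS_DEDUPLICATION( ch_contigs_and_reads )
        ch_bam = READS_DEDUPLICATION.out.bam
    } else if( !params.skip_filtering ) {
        MINIMAP2( ch_contigs_and_reads, '02_assembly' )
        ch_bam = MINIMAP2.out.bam
    }
    GET_ALIGNMENT_METRICS( ch_bam, '02_assembly' )   // never fires on an empty channel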
diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf
index b13e4900c335639aef1a8fba745aef9762424799..be38cd68cfb37858a6a386553a46f73cd1f58017 100644
--- a/subworkflows/03_filtering.nf
+++ b/subworkflows/03_filtering.nf
@@ -1,14 +1,25 @@
 include { CHUNK_ASSEMBLY_FILTER; MERGE_ASSEMBLY_FILTER} from '../modules/filtering_cpm.nf'
-include { QUAST as FILTERED_QUAST } from '../modules/metaquast'
+include { QUAST } from '../modules/metaquast'
+include { MINIMAP2; BWA_MEM } from '../modules/read_alignment'
+include { GET_ALIGNMENT_METRICS; FILTER_BAM; EXTRACT_SINGLE_READS; EXTRACT_PAIRED_READS; MERGE_BAM_FILES} from '../modules/read_alignment_manipulation'
 
 workflow STEP_03_ASSEMBLY_FILTER {
   take:
-    assembly_and_idxstats
+    assemblies
+    idxstats
+    bam
     min_cpm
 
   main:
+    ch_chunk_assembly_for_filter = assemblies
+        .splitFasta(by: 100000, file: true)
+
+    ch_assembly_and_idxstats = ch_chunk_assembly_for_filter
+        .combine(idxstats, by:0)
+
     CHUNK_ASSEMBLY_FILTER (
-        assembly_and_idxstats,
+        ch_assembly_and_idxstats,
         min_cpm
     )
     ch_chunk_selected = CHUNK_ASSEMBLY_FILTER.out.chunk_selected
@@ -28,12 +39,67 @@
         min_cpm
     )
     ch_merged_selected = MERGE_ASSEMBLY_FILTER.out.merged_selected
+    discarded_contigs = MERGE_ASSEMBLY_FILTER.out.merged_discarded
+
+    // Differentiate samples with and without discarded contigs:
+    // samples with no discarded contigs skip the BAM filtering steps below
+    discarded_contigs_category = discarded_contigs.branch{
+        without: it[1].isEmpty()
+        with: !it[1].isEmpty()
+    }
+
+    samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]}
+    ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam)
+
+    ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam)
+
+    FILTER_BAM(ch_discarded_contigs_and_bam)
+
+    ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam
+    ch_selected_bam = FILTER_BAM.out.selected_contigs_bam
+
+    if ( params.type.toUpperCase() == "SR" ) {
+
+        ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam)
+
+        ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads)
-    FILTERED_QUAST( ch_merged_selected, "03_filtering/quast_filtered" )
-    ch_filtered_report = FILTERED_QUAST.out.report
+        BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering")
+        ch_bam_with_discarded_reads = BWA_MEM.out.bam
+    } else {
+        ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam)
+        ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads)
+        MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering")
+        ch_bam_with_discarded_reads = MINIMAP2.out.bam
+    }
+
+    // Drop the bai from ch_bam_with_discarded_reads and add the bam of selected contigs
+    ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam)
+
+    ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge)
+
+    ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam)
+
+    GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/')
+
+    ch_flagstat = GET_ALIGNMENT_METRICS.out.sam_flagstat
+    ch_coverage = GET_ALIGNMENT_METRICS.out.sam_coverage
+
+    QUAST( ch_merged_selected, "03_filtering/quast_filtered" )
+    ch_quast_report = QUAST.out.report
 
   emit:
-    selected = ch_merged_selected
-    report = ch_filtered_report
-}
\ No newline at end of file
+    selected_contigs = ch_merged_selected
+    quast_report = ch_quast_report
+    bam = ch_final_bam
+    sam_coverage = ch_coverage
+    sam_flagstat = ch_flagstat
+
+}
+
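The branch() in the hunk above partitions samples by whether their discarded-contigs FASTA is empty; only samples that actually lost contigs go through FILTER_BAM, re-alignment of the discarded reads and MERGE_BAM_FILES, while the rest keep their original BAM. A toy example of the same branch()/join() mechanics, with invented sample values:

    // Toy example of the branch()/join() pattern (values invented)
    ch_discarded = Channel.of( [ [id: 's1'], file('s1_discarded.fasta') ],
                               [ [id: 's2'], file('s2_discarded.fasta') ] )
    ch_bam = Channel.of( [ [id: 's1'], file('s1.bam'), file('s1.bam.bai') ],
                         [ [id: 's2'], file('s2.bam'), file('s2.bam.bai') ] )

    discarded_contigs_category = ch_discarded.branch {
        without: it[1].isEmpty()    // empty FASTA: no contig was discarded
        with:    !it[1].isEmpty()
    }

    // samples without discarded contigs keep their original BAM as-is
    discarded_contigs_category.without.map { it -> it[0] }.join(ch_bam).view()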
diff --git a/subworkflows/04_structural_annot.nf b/subworkflows/04_structural_annot.nf
index 61bff361e55fc240ffabdaeda83f5ceb9070c3a1..49841efd8fefc08910d4463108fea3820a57293b 100644
--- a/subworkflows/04_structural_annot.nf
+++ b/subworkflows/04_structural_annot.nf
@@ -1,19 +1,31 @@
-include { PROKKA; RENAME_CONTIGS_AND_GENES } from '../modules/prokka'
+include { PRODIGAL } from '../modules/prodigal'
+include { BARRNAP } from '../modules/barrnap'
+include { TRNASCAN_SE } from '../modules/trnascan_se'
+include { MERGE_ANNOTATIONS } from '../modules/merge_annotations'
+
 workflow STEP_04_STRUCTURAL_ANNOT {
   take:
     assembly
 
   main:
-    PROKKA( assembly )
-    ch_software_versions = PROKKA.out.v_prokka.first()
-    RENAME_CONTIGS_AND_GENES(PROKKA.out.prokka_results)
+    PRODIGAL( assembly )
+    BARRNAP( assembly )
+    TRNASCAN_SE( assembly )
+
+    annotations_ch = assembly.join(PRODIGAL.out.faa).join(PRODIGAL.out.gff).join(BARRNAP.out.gff)
+                             .join(TRNASCAN_SE.out.gff)
+
+    MERGE_ANNOTATIONS(annotations_ch)
+
+    ch_software_versions = PRODIGAL.out.v_prodigal.first().mix( BARRNAP.out.v_barrnap.first(),
+                                                                TRNASCAN_SE.out.v_tRNAscan.first())
 
   emit:
-    report = PROKKA.out.report
-    fna = RENAME_CONTIGS_AND_GENES.out.fna
-    ffn = RENAME_CONTIGS_AND_GENES.out.ffn
-    gff = RENAME_CONTIGS_AND_GENES.out.gff
-    faa = RENAME_CONTIGS_AND_GENES.out.faa
-    prot_length = RENAME_CONTIGS_AND_GENES.out.prot_length
+    report = MERGE_ANNOTATIONS.out.report
+    ffn = MERGE_ANNOTATIONS.out.ffn
+    gff = MERGE_ANNOTATIONS.out.gff
+    faa = MERGE_ANNOTATIONS.out.faa
     software_versions = ch_software_versions
 }
\ No newline at end of file
diff --git a/subworkflows/05_alignment.nf b/subworkflows/05_alignment.nf
index 1d7c54350f72445779c076e2960f26587b03c65a..990223080f9035a29a23d1e177f7bece3d46eeef 100644
--- a/subworkflows/05_alignment.nf
+++ b/subworkflows/05_alignment.nf
@@ -1,37 +1,13 @@
-include { MINIMAP2; BWA_MEM } from '../modules/read_alignment'
-include { SAMTOOLS } from '../modules/read_alignment_metrics'
 include { DIAMOND } from '../modules/diamond'
 
 workflow STEP_05_ALIGNMENT {
   take:
-    contigs_and_reads
     prokka_faa
     diamond
 
   main:
-    ch_bwa_mem_v = Channel.empty()
-    ch_minimap2_v = Channel.empty()
-    ch_samtools_v = Channel.empty()
     ch_diamond_v = Channel.empty()
 
-    publishDir = "05_alignment/05_1_reads_alignment_on_contigs"
-    if (params.type == 'SR') {
-        BWA_MEM(contigs_and_reads, publishDir)
-        ch_bam = BWA_MEM.out.bam
-
-        ch_bwa_mem_v = BWA_MEM.out.v_bwa_mem2
-        ch_samtools_v = BWA_MEM.out.v_samtools
-    } else {
-        MINIMAP2(contigs_and_reads, publishDir)
-        ch_bam = MINIMAP2.out.bam
-
-        ch_minimap2_v = MINIMAP2.out.v_minimap2
-        ch_samtools_v = MINIMAP2.out.v_samtools
-    }
-
-    SAMTOOLS(ch_bam, publishDir)
-    ch_sam_coverage = SAMTOOLS.out.sam_coverage
-    ch_sam_flagstat = SAMTOOLS.out.sam_flagstat
 
     ch_m8 =Channel.empty()
     if (!params.skip_func_annot || !params.skip_taxo_affi){
@@ -42,13 +18,7 @@
         ch_m8 = DIAMOND.out.m8
         ch_diamond_v = DIAMOND.out.v_diamond
     }
-    ch_software_versions = ch_bwa_mem_v.first().mix(ch_minimap2_v.first(),
-                                                    ch_samtools_v.first(),
-                                                    ch_diamond_v.first())
 
   emit:
-    bam = ch_bam
     m8 = ch_m8
-    sam_coverage = ch_sam_coverage
-    software_versions = ch_software_versions
-    sam_flagstat = ch_sam_flagstat
+    software_versions = ch_diamond_v.first()
 }
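STEP_04 now composes Prodigal (CDS), Barrnap (rRNA) and tRNAscan-SE (tRNA) and merges their outputs with MERGE_ANNOTATIONS, while STEP_05 is reduced to the DIAMOND protein alignment, since contig alignment now happens in steps 02 and 03. The annotations_ch construction relies on repeated join calls keyed on meta; a toy example of the same mechanics, with invented sample values:

    // annotations_ch shape after the joins in the hunk above:
    //   [ meta, assembly, prodigal.faa, prodigal.gff, barrnap.gff, trnascan_se.gff ]
    ch_a = Channel.of( [ [id: 's1'], file('s1.fasta') ] )
    ch_b = Channel.of( [ [id: 's1'], file('s1.faa') ] )
    ch_a.join(ch_b).view()   // [ [id:'s1'], s1.fasta, s1.faa ]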
diff --git a/subworkflows/07_taxonomic_affi.nf b/subworkflows/07_taxonomic_affi.nf
index d1042c07ba7a873ae86b1083ea589d586a6ac3d7..8eb26400f25ba65fe4c3f3d8dda3acc8f3e58330 100644
--- a/subworkflows/07_taxonomic_affi.nf
+++ b/subworkflows/07_taxonomic_affi.nf
@@ -7,10 +7,8 @@ workflow STEP_07_TAXO_AFFI {
     taxonomy
     diamond_result // channel: [ val(meta), path(diamond_file) ]
     sam_coverage // channel: [ val(meta), path(samtools coverage) ]
-    prot_length // channel: [ val(meta), path(prot_length) ]
 
   main:
     ch_assign_taxo_input = diamond_result.join(sam_coverage, remainder: true)
-        .join(prot_length, remainder: true)
 
     ASSIGN_TAXONOMY ( taxonomy, ch_assign_taxo_input )
diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf
index 97df1a6a74234e90e106b75e4a962aea98c20756..59b9c7b7b7dbf16051dfb3d0c38c4d4370f24e92 100644
--- a/subworkflows/08_binning.nf
+++ b/subworkflows/08_binning.nf
@@ -2,7 +2,7 @@ include { GENERATE_DEPTH_FILES; METABAT2; MAXBIN2; CONCOCT; METAWRAP_REFINMENT;
 include { CHECKM2 } from '../modules/checkm2'
 include { DREP } from '../modules/drep'
 include { BWA_MEM;BWA_MEM as BWA_MEM_CROSS_ALIGNMENT; MINIMAP2; MINIMAP2 as MINIMAP2_CROSS_ALIGNMENT } from '../modules/read_alignment'
-include { SAMTOOLS } from '../modules/read_alignment_metrics'
+include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation'
 include { GTDBTK } from '../modules/gtdbtk'
 include { GENOMES_ABUNDANCES_PER_SAMPLE; ADD_QUAST_INFO_TO_BINS; BINS_STATS_TO_MUTLIQC_FORMAT} from '../modules/sum_up_bins_informations'
 
@@ -213,11 +213,11 @@
     ch_collect_coverages = Channel.empty()
     ch_collect_flagstats = Channel.empty()
 
-    SAMTOOLS(ch_bam, "08_binning/08_4_mapping_on_final_bins/stats")
+    GET_ALIGNMENT_METRICS(ch_bam, "08_binning/08_4_mapping_on_final_bins/stats")
 
-    ch_collect_coverages = SAMTOOLS.out.sam_coverage.map {id, file -> file}
+    ch_collect_coverages = GET_ALIGNMENT_METRICS.out.sam_coverage.map {id, file -> file}
                                .collect()
-    ch_collect_flagstats = SAMTOOLS.out.sam_flagstat.collect()
+    ch_collect_flagstats = GET_ALIGNMENT_METRICS.out.sam_flagstat.collect()
 
     GENOMES_ABUNDANCES_PER_SAMPLE(ch_collect_coverages, ch_collect_flagstats, \
                                   ch_bins_drep, DREP.out.output_drep_stats, GTDBTK.out.gtdbtk_affiliations_predictions,