From 5a2b42a5432b7f16ef8b6d267b63edc91d7a58fc Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 09:14:44 +0100 Subject: [PATCH 01/25] fix snp --- bed2vcf.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/bed2vcf.py b/bed2vcf.py index 1506eff..4f04fba 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -15,6 +15,11 @@ def read_fa(input_file): def DNA(length): return ''.join(random.choice('CGTA') for _ in range(length)) +def SNP(aa): + s = 'ACGT' + s = s.replace(aa, '') + return(random.choice(s)) + # check if reverse def is_reverse(s): return(s=="reverse") @@ -35,6 +40,13 @@ def set_ref_alt(ref, alt, row, vcf_df): vcf_df.at[row, "ALT"] = alt def get_random_len(svtype): + # if svtype == "SNP": + # return(1) + # else: + # d = {"deletion": "DEL", "insertion" : "INS", "inversion" : "INV", + # "CNV", "duplication" : "DUP", "inverted tandem duplication" : "DUP"} + # svtype = d[svtype] + # INS, DEL, DUP, CNV, INV df = pd.read_csv("sv_distributions/size_distrib" + svtype + ".tsv", sep="\t") pb = df["pb"].tolist() @@ -68,8 +80,9 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): fasta_seq = fa_dict[chr_name].upper() if sv_type == "SNP": - ref = str(fasta_seq.seq[start:end]) - alt = sv_info[4] + ref = str(fasta_seq.seq[start]) + # alt = sv_info[4] + alt = SNP(ref) elif sv_type == "deletion": end = start + get_random_len("DEL") @@ -94,7 +107,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): cp = int(sv_info[4])-1 # multiplication pour obtenir les copies alt_seq = alt_seq1*cp - if sv_type =="inverted tandem duplication": + if sv_type == "inverted tandem duplication": alt_seq = reverse(alt_seq) alt = ref + alt_seq -- GitLab From 3dff0e2fe597ca9d34409a61a4bb90b147ca2e53 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 09:14:54 +0100 Subject: [PATCH 02/25] add header size --- fusion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fusion.py b/fusion.py index ea0696c..68a138a 100644 --- a/fusion.py +++ b/fusion.py @@ -4,7 +4,7 @@ import io, os, shutil import pandas as pd -def read_vcf(input_file): +def read_vcf(input_file, header_size=6): with open(input_file, "r") as txt: t = txt.read() txt.close() @@ -13,11 +13,11 @@ def read_vcf(input_file): if not os.path.exists("results"): os.mkdir("results") f = open("results/vcf_header.txt", "w") - f.write(''.join(t.splitlines(keepends=True)[:6])) + f.write(''.join(t.splitlines(keepends=True)[:header_size])) f.close() # remove VCF header for dataframe - t = ''.join(t.splitlines(keepends=True)[5:]) + t = ''.join(t.splitlines(keepends=True)[header_size-1:]) df = pd.read_table(io.StringIO(t)) df = df.rename(columns={"#CHROM" : "CHROM"}) return(df) -- GitLab From fb5ed09037f54b03b48a0386c7884301d114f0bb Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 09:15:17 +0100 Subject: [PATCH 03/25] clean up --- VISOR_random_bed/randomregion.r | 20 ++++++++++---------- main.py | 23 ++++------------------- tree_generation.py | 4 ++-- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/VISOR_random_bed/randomregion.r b/VISOR_random_bed/randomregion.r index 64741c4..a6ca763 100644 --- a/VISOR_random_bed/randomregion.r +++ b/VISOR_random_bed/randomregion.r @@ -1,13 +1,13 @@ -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager",repos='http://cran.us.r-project.org') -if (!requireNamespace("optparse", quietly = TRUE)) - install.packages("optparse",repos='http://cran.us.r-project.org') -if (!requireNamespace("data.table", quietly = TRUE)) - install.packages("data.table",repos='http://cran.us.r-project.org') -if (!requireNamespace("regioneR", quietly = TRUE)) - BiocManager::install("regioneR") -if (!requireNamespace("Rsamtools", quietly = TRUE)) - BiocManager::install("Rsamtools") +# if (!requireNamespace("BiocManager", quietly = TRUE)) +# install.packages("BiocManager",repos='http://cran.us.r-project.org') +# if (!requireNamespace("optparse", quietly = TRUE)) +# install.packages("optparse",repos='http://cran.us.r-project.org') +# if (!requireNamespace("data.table", quietly = TRUE)) +# install.packages("data.table",repos='http://cran.us.r-project.org') +# if (!requireNamespace("regioneR", quietly = TRUE)) +# BiocManager::install("regioneR") +# if (!requireNamespace("Rsamtools", quietly = TRUE)) +# BiocManager::install("Rsamtools") suppressPackageStartupMessages(library(optparse)) diff --git a/main.py b/main.py index da8b303..991a5ce 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ from tree_generation import msprime_vcf import defopt ## n : length of variant -def sv_vcf(bed: str, vcf: str, fasta: str, outName: str): +def sv_vcf(*, bed: str, vcf: str, fasta: str, outName: str, header: int): """ Create a VCF with structural variants. @@ -12,10 +12,11 @@ def sv_vcf(bed: str, vcf: str, fasta: str, outName: str): :parameter vcf: VCF generated with msprime --> tree generation script :parameter fasta: Reference FASTA for the VCF and the variants :parameter outName: Output name + :parameter header: vcf header size """ bed_df = replace_bed_col(bed, vcf) - vcf_df=read_vcf(vcf) + vcf_df=read_vcf(vcf, header) fa = read_fa(fasta) get_seq(vcf_df, bed_df, fa, outName) @@ -26,20 +27,4 @@ def sv_vcf(bed: str, vcf: str, fasta: str, outName: str): merge_final_vcf(header_file, content_file, merged_file) if __name__ == '__main__': - defopt.run([sv_vcf, msprime_vcf]) - -bed = "test/mini_random.bed" -vcf = "test/output.vcf" -fasta = "/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" - -output_file="tritici_test" -# main(bed, vcf, fasta, output_file) - -### generate msprime population VCF -# fai = "ztIPO323.chr8.fasta.fai" - -# ## samtools FAI -# ## population size -# ## mutation rate -# ## sample size -# msprime_vcf(fai, 5000, 1e-8, 60) \ No newline at end of file + defopt.run([sv_vcf, msprime_vcf]) \ No newline at end of file diff --git a/tree_generation.py b/tree_generation.py index 68b67f3..8548db0 100644 --- a/tree_generation.py +++ b/tree_generation.py @@ -32,7 +32,7 @@ def msprime_map(df): # pop_size = population size # mut_rate = mutation rate # n = sample size / number of indiv -def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int): +def msprime_vcf(*, fai: str, pop_size: int, mut_rate: float, n: int): """ Generate VCF for each chromosome in reference FASTA. @@ -55,7 +55,7 @@ def msprime_vcf(fai: str, pop_size: int, mut_rate: float, n: int): mutated_ts = msprime.sim_mutations( ts, rate=mut_rate, # random_seed=5678, - discrete_genome=False) + discrete_genome=True) ts_chroms = [] -- GitLab From fedeb252b91f53343eef8935cd0dbbf6b106e537 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 10:36:36 +0100 Subject: [PATCH 04/25] clean up --- bed2vcf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bed2vcf.py b/bed2vcf.py index 4f04fba..09add71 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -1,7 +1,6 @@ import pandas as pd import re, random, json, os import numpy as np - from Bio import SeqIO from Bio.Seq import Seq -- GitLab From c2ab9eebfd6c4637e019ad376f4d96959f12e952 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 10:37:14 +0100 Subject: [PATCH 05/25] add multiprocessing, change to argparse --- main.py | 30 ------------------------------ parsing.py | 36 ++++++++++++++++++++++++++++++++++++ tree_generation.py | 30 +++++++++++++++++++----------- 3 files changed, 55 insertions(+), 41 deletions(-) delete mode 100644 main.py create mode 100644 parsing.py diff --git a/main.py b/main.py deleted file mode 100644 index 991a5ce..0000000 --- a/main.py +++ /dev/null @@ -1,30 +0,0 @@ -from bed2vcf import read_fa, get_seq -from fusion import * -from tree_generation import msprime_vcf -import defopt - -## n : length of variant -def sv_vcf(*, bed: str, vcf: str, fasta: str, outName: str, header: int): - """ - Create a VCF with structural variants. - - :parameter bed: BED file of structural variants to include - :parameter vcf: VCF generated with msprime --> tree generation script - :parameter fasta: Reference FASTA for the VCF and the variants - :parameter outName: Output name - :parameter header: vcf header size - - """ - bed_df = replace_bed_col(bed, vcf) - vcf_df=read_vcf(vcf, header) - fa = read_fa(fasta) - - get_seq(vcf_df, bed_df, fa, outName) - - header_file="results/vcf_header.txt" - content_file = "results/" + outName + ".vcf" - merged_file="results/" + outName + "_final.vcf" - merge_final_vcf(header_file, content_file, merged_file) - -if __name__ == '__main__': - defopt.run([sv_vcf, msprime_vcf]) \ No newline at end of file diff --git a/parsing.py b/parsing.py new file mode 100644 index 0000000..3be4bf7 --- /dev/null +++ b/parsing.py @@ -0,0 +1,36 @@ +from bed2vcf import read_fa, get_seq +from fusion import * +import argparse +from multiprocessing import Process + +## n : length of variant +def sv_vcf(bed, vcf, fasta, outName, header): + bed_df = replace_bed_col(bed, vcf) + if header != None: + vcf_df=read_vcf(vcf, header) + else: + vcf_df=read_vcf(vcf) + fa = read_fa(fasta) + + get_seq(vcf_df, bed_df, fa, outName) + + header_file="results/vcf_header.txt" + content_file = "results/" + outName + ".vcf" + merged_file="results/" + outName + "_final.vcf" + merge_final_vcf(header_file, content_file, merged_file) + +# parse arguments +parser = argparse.ArgumentParser(description='create final VCF') +parser.add_argument('bed', type=str, help='BED file of structural variants to include') +parser.add_argument('vcf', type=str, help='VCF generated with msprime --> tree generation script') +parser.add_argument('fasta', type=str, help='Reference FASTA for the VCF and the variants') +parser.add_argument('outName', type=str, help='Output name') +parser.add_argument('--header', type=int, help='vcf header size') + +if __name__ == '__main__': + args = parser.parse_args() + d = vars(args) + li = list(d.values()) + p = Process(target=sv_vcf, args=(li)) + p.start() + p.join() \ No newline at end of file diff --git a/tree_generation.py b/tree_generation.py index 8548db0..8d5eea5 100644 --- a/tree_generation.py +++ b/tree_generation.py @@ -1,6 +1,7 @@ import pandas as pd -import math, msprime +import math, msprime, argparse import numpy as np +from multiprocessing import Process def chrom_pos(df): l=df["length"].to_list() @@ -32,15 +33,8 @@ def msprime_map(df): # pop_size = population size # mut_rate = mutation rate # n = sample size / number of indiv -def msprime_vcf(*, fai: str, pop_size: int, mut_rate: float, n: int): - """ - Generate VCF for each chromosome in reference FASTA. - - :parameter fai: FAI samtools index of reference FASTA - :parameter pop_size: population size (Ne) - :parameter mut_rate: mutation rate (µ) - :parameter n: sample size - """ +def msprime_vcf(fai, pop_size, mut_rate, n): + df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) rate_map=msprime_map(df) @@ -85,4 +79,18 @@ def msprime_vcf(*, fai: str, pop_size: int, mut_rate: float, n: int): vcf_file.write(t) vcf_file.close() - print(len(ts_chroms)) \ No newline at end of file + print(len(ts_chroms)) + +parser = argparse.ArgumentParser(description='Generate VCF for each chromosome in reference FASTA.') +parser.add_argument('fai', type=str, help='FAI samtools index of reference FASTA') +parser.add_argument('pop_size', type=int, help='population size (Ne)') +parser.add_argument('mut_rate', type=float, help='mutation rate (µ)') +parser.add_argument('n', type=int, help='sample size') + +if __name__ == '__main__': + args = parser.parse_args() + d = vars(args) + li = list(d.values()) + p = Process(target=msprime_vcf, args=(li)) + p.start() + p.join() \ No newline at end of file -- GitLab From af19cb458d6a75d64049dd9ae4d64b96f4a4bcbf Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 14:30:35 +0100 Subject: [PATCH 06/25] modified parsing --- parsing.py => variants_generation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename parsing.py => variants_generation.py (65%) diff --git a/parsing.py b/variants_generation.py similarity index 65% rename from parsing.py rename to variants_generation.py index 3be4bf7..27af37e 100644 --- a/parsing.py +++ b/variants_generation.py @@ -21,10 +21,10 @@ def sv_vcf(bed, vcf, fasta, outName, header): # parse arguments parser = argparse.ArgumentParser(description='create final VCF') -parser.add_argument('bed', type=str, help='BED file of structural variants to include') -parser.add_argument('vcf', type=str, help='VCF generated with msprime --> tree generation script') -parser.add_argument('fasta', type=str, help='Reference FASTA for the VCF and the variants') -parser.add_argument('outName', type=str, help='Output name') +parser.add_argument('-b','--bed', type=str, required = True, help='BED file of structural variants to include') +parser.add_argument('-v', '--vcf', type=str, required = True, help='VCF generated with msprime --> tree generation script') +parser.add_argument('-fa', '--fasta', type=str, required = True, help='Reference FASTA for the VCF and the variants') +parser.add_argument('-o', '--outName', type=str, required = True, help='Output name') parser.add_argument('--header', type=int, help='vcf header size') if __name__ == '__main__': -- GitLab From 58b5464d561697f70f683a881646d7dba2bc0702 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 14:30:50 +0100 Subject: [PATCH 07/25] added output directory parameter --- tree_generation.py | 62 +++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/tree_generation.py b/tree_generation.py index 8d5eea5..223b71d 100644 --- a/tree_generation.py +++ b/tree_generation.py @@ -1,5 +1,5 @@ import pandas as pd -import math, msprime, argparse +import math, msprime, argparse, os import numpy as np from multiprocessing import Process @@ -29,11 +29,41 @@ def msprime_map(df): rate_map=msprime.RateMap(position=map_positions, rate=rates) return(rate_map) +### ts_chroms = list of trees for each chromosome +### names = list of chromosome name +def output_files(ts_chroms, names, outdir="results"): + if not os.path.exists(outdir): + os.mkdir(outdir) + + ## write vcf + filename = outdir + "/msprime_simulation.vcf" + for i in range(len(ts_chroms)): + txt = ts_chroms[i].as_vcf(contig_id=names[i]) + t = ''.join(txt.splitlines(keepends=True)[6:]) + # filename = 'tmp_vcf/output' + str(i) + ".vcf" + if i==0: + header_top = ''.join(txt.splitlines(keepends=True)[:4]) + header_bot = ''.join(txt.splitlines(keepends=True)[4:6]) + with open(filename, "w") as vcf_file: + vcf_file.write(t) + # ts_chroms[i].write_vcf(vcf_file, contig_id=names[i]) + else: + header_top += txt.splitlines(keepends=True)[3] + + with open(filename, "a") as vcf_file: + vcf_file.write(t) + vcf_file.close() + ## write header to file + header = header_top + header_bot + with open(outdir + "/header.vcf", "w") as head: + head.write(header) + head.close() + # input_fai = samtools FAI of FASTA where variants will be generated # pop_size = population size # mut_rate = mutation rate # n = sample size / number of indiv -def msprime_vcf(fai, pop_size, mut_rate, n): +def msprime_vcf(fai, pop_size, mut_rate, n, outdir): df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) rate_map=msprime_map(df) @@ -63,29 +93,21 @@ def msprime_vcf(fai, pop_size, mut_rate, n): start, end = chrom_positions[j: j + 2] chrom_ts = mutated_ts.keep_intervals([[start, end]], simplify=False).trim() ts_chroms.append(chrom_ts) - print(chrom_ts.sequence_length) names=df["name"].to_list() - for i in range(len(ts_chroms)): - filename="msprime_output_chr.vcf" - if i==0: - with open(filename, "w") as vcf_file: - ts_chroms[i].write_vcf(vcf_file, contig_id=names[i]) - else: - txt = ts_chroms[i].as_vcf(contig_id=names[i]) - t = ''.join(txt.splitlines(keepends=True)[6:]) - with open(filename, "a") as vcf_file: - vcf_file.write(t) - vcf_file.close() - - print(len(ts_chroms)) + ## create files + if outdir == None: + output_files(ts_chroms, names) + else: + output_files(ts_chroms, names, outdir) parser = argparse.ArgumentParser(description='Generate VCF for each chromosome in reference FASTA.') -parser.add_argument('fai', type=str, help='FAI samtools index of reference FASTA') -parser.add_argument('pop_size', type=int, help='population size (Ne)') -parser.add_argument('mut_rate', type=float, help='mutation rate (µ)') -parser.add_argument('n', type=int, help='sample size') +parser.add_argument('-fai', '--fai', type=str, required = True, help='FAI samtools index of reference FASTA') +parser.add_argument('-p', '--popSize', type=int, required = True, help='population size (Ne)') +parser.add_argument('-r', '--rate', type=float, required = True, help='mutation rate (µ)') +parser.add_argument('-n', '--sampleSize', type=int, required = True, help='sample size') +parser.add_argument('-o', '--outDir', type=str, help='output directory') if __name__ == '__main__': args = parser.parse_args() -- GitLab From d70fee9bd97399a651ae92973628adf2c1435f4a Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Fri, 17 Nov 2023 17:06:33 +0100 Subject: [PATCH 08/25] script for variant creation --- VISOR_random_bed/randombed.py | 75 +++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 VISOR_random_bed/randombed.py diff --git a/VISOR_random_bed/randombed.py b/VISOR_random_bed/randombed.py new file mode 100644 index 0000000..6d49325 --- /dev/null +++ b/VISOR_random_bed/randombed.py @@ -0,0 +1,75 @@ +import pandas as pd +import yaml, random + +def read_yaml(f): + stream = open(f, 'r') + d = yaml.safe_load(stream) + stream.close() + if sum(d.values()) != 100: + raise ValueError("Sum of values must be 100") + + return(d) + +infos = {'deletion': None, 'insertion': None, 'inversion': None, + 'tandem duplication': 2, 'inverted tandem duplication': 2, + 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0, 'SNP': None} + +# generate info field +def select_chr(fai): + df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) + names=df["name"].to_list() + chr = random.choice(names) + df = pd.pivot_table(df, columns='name') + len = df.iloc[0][chr] + pos = random.randrange(len) + li = [chr, pos] + return(li) + +def select_orientation(li): + li.append(random.choice(["forward", "reverse"])) + return(li) + + +def generate_type(nvar, yml, fai): + d = read_yaml(yml) + print(d) + ## list for variant type and info + vartype = [] + infos = [] + ## filter dict to keep value != 0 + dict = {k: v for k, v in d.items() if v != 0} + for k in dict: + perc = dict[k] + n = round(perc*nvar/100) + if k == "tandem duplication": + info = 2 + elif k == 'translocation copy-paste' or k == 'translocation cut-paste': + info = select_chr(fai) + info = select_orientation(info) + elif k == "reciprocal translocation": + info = select_chr(fai) + info = select_orientation(info) + info = select_orientation(info) + else: + info = None + + vartype.extend([k]*n) + infos.extend([info]*n) + for_df = {'variant': vartype, 'info': infos} + + df = pd.DataFrame(data = for_df) + df = df.sample(frac = 1) + print(df) + + +# tandem duplication: 0 +# inverted tandem duplication: 0 +# translocation copy-paste: 0 +# translocation cut-paste : 0 +# reciprocal translocation: 0 + + + +fai = "/home/sukanya/tests/02_data/Sibirica_v1.0.fa.fai" +# select_chr(fai) +generate_type(12, "visor_sv_type.yaml", fai) -- GitLab From 6dd472c77fa7f6314a5b4161d06840b160e4512b Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 20 Nov 2023 11:01:12 +0100 Subject: [PATCH 09/25] add variant type generation script --- randombed.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 randombed.py diff --git a/randombed.py b/randombed.py new file mode 100644 index 0000000..ee5fb1b --- /dev/null +++ b/randombed.py @@ -0,0 +1,80 @@ +import pandas as pd +import yaml, random, re + +## read yaml file containing variants informations +def read_yaml(f): + stream = open(f, 'r') + d = yaml.safe_load(stream) + stream.close() + if sum(d.values()) != 100: + raise ValueError("Sum of values must be 100") + return(d) + +infos_dict = {'deletion': None, 'insertion': None, 'inversion': None, 'SNP': None, + 'tandem duplication': 2, 'inverted tandem duplication': 2, + 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0} + +## generate info field for translocations +# randomly select chromosome to add sequence to +def select_chr(fai): + df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) + names=df["name"].to_list() + chr = random.choice(names) + df = pd.pivot_table(df, columns='name') + len = df.iloc[0][chr] + pos = random.randrange(len) + li = [chr, pos] + return(li) + +# randomly select sequence orientation +def select_orientation(li): + li.append(random.choice(["forward", "reverse"])) + return(li) + +## create dataframe with all the variants to simulate +def generate_type(nvar, yml, fai): + d = read_yaml(yml) + print(d) + ## list for variant type and info + vartype = [] + infos = [] + ## filter dict to keep value != 0 + dict = {k: v for k, v in d.items() if v != 0} + for k in dict: + # generate n variants + perc = dict[k] + n = round(perc*nvar/100) + # add info field + if re.search("translocation", k): + for i in range(n): + info = select_chr(fai) + info = select_orientation(info) + if k == "reciprocal translocation": + info = select_orientation(info) + infos.extend([info]) + i += 1 + else: + info = infos_dict[k] + infos.extend([info]*n) + vartype.extend([k]*n) + + for_df = {'variant': vartype, 'info': infos} + + df = pd.DataFrame(data = for_df) + # shuffle values in dataframe + df = df.sample(frac = 1) + # print(df) + return(df) + + +# tandem duplication: 0 +# inverted tandem duplication: 0 +# translocation copy-paste: 0 +# translocation cut-paste : 0 +# reciprocal translocation: 0 + + + +fai = "/home/sukanya/tests/02_data/Sibirica_v1.0.fa.fai" +# select_chr(fai) +generate_type(12, "visor_sv_type.yaml", fai) -- GitLab From 21c63390295a7178cb38f4125c39f12a9df10521 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 20 Nov 2023 11:04:46 +0100 Subject: [PATCH 10/25] updated script --- VISOR_random_bed/randombed.py | 49 +++++++++++++++++++---------------- bed2vcf.py | 46 ++++++++++++-------------------- 2 files changed, 44 insertions(+), 51 deletions(-) diff --git a/VISOR_random_bed/randombed.py b/VISOR_random_bed/randombed.py index 6d49325..788577a 100644 --- a/VISOR_random_bed/randombed.py +++ b/VISOR_random_bed/randombed.py @@ -1,20 +1,21 @@ import pandas as pd -import yaml, random +import yaml, random, re +## read yaml file containing variants informations def read_yaml(f): stream = open(f, 'r') d = yaml.safe_load(stream) stream.close() if sum(d.values()) != 100: raise ValueError("Sum of values must be 100") - return(d) -infos = {'deletion': None, 'insertion': None, 'inversion': None, +infos_dict = {'deletion': None, 'insertion': None, 'inversion': None, 'SNP': None, 'tandem duplication': 2, 'inverted tandem duplication': 2, - 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0, 'SNP': None} + 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0} -# generate info field +## generate info field for translocations +# randomly select chromosome to add sequence to def select_chr(fai): df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) names=df["name"].to_list() @@ -25,11 +26,12 @@ def select_chr(fai): li = [chr, pos] return(li) +# randomly select sequence orientation def select_orientation(li): li.append(random.choice(["forward", "reverse"])) return(li) - +## create dataframe with all the variants to simulate def generate_type(nvar, yml, fai): d = read_yaml(yml) print(d) @@ -39,27 +41,30 @@ def generate_type(nvar, yml, fai): ## filter dict to keep value != 0 dict = {k: v for k, v in d.items() if v != 0} for k in dict: + # generate n variants perc = dict[k] n = round(perc*nvar/100) - if k == "tandem duplication": - info = 2 - elif k == 'translocation copy-paste' or k == 'translocation cut-paste': - info = select_chr(fai) - info = select_orientation(info) - elif k == "reciprocal translocation": - info = select_chr(fai) - info = select_orientation(info) - info = select_orientation(info) + # add info field + if re.search("translocation", k): + for i in range(n): + info = select_chr(fai) + info = select_orientation(info) + if k == "reciprocal translocation": + info = select_orientation(info) + infos.extend([info]) + i += 1 else: - info = None - + info = infos_dict[k] + infos.extend([info]*n) vartype.extend([k]*n) - infos.extend([info]*n) + for_df = {'variant': vartype, 'info': infos} df = pd.DataFrame(data = for_df) + # shuffle values in dataframe df = df.sample(frac = 1) - print(df) + # print(df) + return(df) # tandem duplication: 0 @@ -70,6 +75,6 @@ def generate_type(nvar, yml, fai): -fai = "/home/sukanya/tests/02_data/Sibirica_v1.0.fa.fai" -# select_chr(fai) -generate_type(12, "visor_sv_type.yaml", fai) +# fai = "/home/sukanya/tests/02_data/Sibirica_v1.0.fa.fai" +# # select_chr(fai) +# generate_type(12, "visor_sv_type.yaml", fai) diff --git a/bed2vcf.py b/bed2vcf.py index 09add71..ee40bf2 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -38,14 +38,7 @@ def set_ref_alt(ref, alt, row, vcf_df): vcf_df.at[row, "REF"] = ref vcf_df.at[row, "ALT"] = alt -def get_random_len(svtype): - # if svtype == "SNP": - # return(1) - # else: - # d = {"deletion": "DEL", "insertion" : "INS", "inversion" : "INV", - # "CNV", "duplication" : "DUP", "inverted tandem duplication" : "DUP"} - # svtype = d[svtype] - +def get_random_len(svtype): # INS, DEL, DUP, CNV, INV df = pd.read_csv("sv_distributions/size_distrib" + svtype + ".tsv", sep="\t") pb = df["pb"].tolist() @@ -56,31 +49,28 @@ def get_random_len(svtype): s = np.random.uniform(interval[0], interval[1], 1).round() return(int(s[0])) -# get the sequence of each variants in BED -# output file is a VCF -# (currently, not all variant supported by VISOR are included) -# TODO : missing VISOR variant type : SNP, MNP and 4 types of tandem repeat - +### get the sequence of each variants in BED, output VCF # vcf_df : msprime VCF # bed_def : VISOR BED # fa_dict : FASTA where variants will be generated def get_seq(vcf_df, bed_df, fa_dict, output_file): - print(bed_df) -# number of variants + # number of variants n = len(vcf_df) for i in range(n): sv_info = bed_df.iloc[i] - chr_name = sv_info[0] - start = sv_info[1] + ## nom du chr pris dans le VCF + chr_name = vcf_df.iloc[i][0] + ## position du variant pris dans le VCF + start = vcf_df.iloc[i][1] start = start-1 # pour ajuster à l'index python - sv_type = sv_info[3] + ## type de SV à générer + sv_type = sv_info[0] fasta_seq = fa_dict[chr_name].upper() if sv_type == "SNP": ref = str(fasta_seq.seq[start]) - # alt = sv_info[4] alt = SNP(ref) elif sv_type == "deletion": @@ -103,7 +93,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): end = start + get_random_len("DUP") ref = str(fasta_seq.seq[start]) alt_seq1 = str(fasta_seq.seq[start+1:end]) - cp = int(sv_info[4])-1 + cp = int(sv_info[1])-1 # multiplication pour obtenir les copies alt_seq = alt_seq1*cp if sv_type == "inverted tandem duplication": @@ -117,25 +107,23 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): end = start + l # informations sur la translocation - trans_info = sv_info[4] - print(trans_info, type(trans_info)) - infos = trans_info.split(":") - trans_start = int(infos[2])-1 + trans_info = sv_info[1] + trans_chr = trans_info[0] + trans_start = trans_info[1]-1 trans_end = trans_start + l - trans_chr = infos[1] - + # translocation réciproque if sv_type == "reciprocal translocation": ref = str(fasta_seq.seq[start:end]) fasta_seq_trans = fa_dict[trans_chr].upper() ref2 = str(fasta_seq_trans.seq[trans_start:trans_end]) - if infos[3] == "reverse": + if trans_info[2] == "reverse": alt = str(fasta_seq.seq[start]) + reverse(str(fasta_seq_trans.seq[trans_start+1:trans_end])) else : alt = str(fasta_seq.seq[start]) + str(fasta_seq_trans.seq[trans_start+1:trans_end]) - if infos[4] == "reverse": + if trans_info[3] == "reverse": alt2 = str(fasta_seq_trans.seq[trans_start]) + reverse(str(fasta_seq.seq[start+1:end])) else: alt2 = str(fasta_seq_trans.seq[trans_start]) + str(fasta_seq.seq[start+1:end]) @@ -148,7 +136,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): # insertion ref2 = str(fasta_seq_trans.seq[trans_start]) - if infos[3] == "reverse": + if trans_info[2] == "reverse": alt2 = ref2 + reverse(str(fasta_seq.seq[start+1:end])) else : alt2 = ref2 + str(fasta_seq.seq[start+1:end]) -- GitLab From e0dee60b3b1a78e75a02b63428bc342915c6dd26 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 20 Nov 2023 11:05:11 +0100 Subject: [PATCH 11/25] changed file reading --- fusion.py | 64 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/fusion.py b/fusion.py index 68a138a..b66f956 100644 --- a/fusion.py +++ b/fusion.py @@ -4,39 +4,43 @@ import io, os, shutil import pandas as pd -def read_vcf(input_file, header_size=6): - with open(input_file, "r") as txt: - t = txt.read() - txt.close() +# def read_vcf(input_file, header_size=6): +# with open(input_file, "r") as txt: +# t = txt.read() +# txt.close() - # write header for later - if not os.path.exists("results"): - os.mkdir("results") - f = open("results/vcf_header.txt", "w") - f.write(''.join(t.splitlines(keepends=True)[:header_size])) - f.close() +# # write header for later +# if not os.path.exists("results"): +# os.mkdir("results") +# f = open("results/vcf_header.txt", "w") +# f.write(''.join(t.splitlines(keepends=True)[:header_size])) +# f.close() - # remove VCF header for dataframe - t = ''.join(t.splitlines(keepends=True)[header_size-1:]) - df = pd.read_table(io.StringIO(t)) - df = df.rename(columns={"#CHROM" : "CHROM"}) - return(df) +# # remove VCF header for dataframe +# t = ''.join(t.splitlines(keepends=True)[header_size-1:]) +# df = pd.read_table(io.StringIO(t)) +# df = df.rename(columns={"#CHROM" : "CHROM"}) +# return(df) -def read_BED(input_file): - colnames=["chr", "start", "end", "type","info","breakpoint"] - df = pd.read_table(input_file, header=None, names=colnames) +def get_vcf(input_file): + df = pd.read_table(input_file, sep="\t") return(df) -def replace_bed_col(input_BED, input_VCF): - vcf = read_vcf(input_VCF) - bed = read_BED(input_BED) - if len(bed) == len(vcf): - bed["chr"] = vcf["CHROM"] - bed["start"] = vcf["POS"] - return(bed) +# def read_BED(input_file): +# colnames=["chr", "start", "end", "type","info","breakpoint"] +# df = pd.read_table(input_file, header=None, names=colnames) +# return(df) + +# def replace_bed_col(input_BED, input_VCF): +# vcf = read_vcf(input_VCF) +# bed = read_BED(input_BED) +# if len(bed) == len(vcf): +# bed["chr"] = vcf["CHROM"] +# bed["start"] = vcf["POS"] +# return(bed) -def merge_final_vcf(header_file, content_file, merged_file): - with open(merged_file, 'wb') as outfile: - for filename in [header_file, content_file]: - with open(filename, 'rb') as infile: - shutil.copyfileobj(infile, outfile) \ No newline at end of file +# def merge_final_vcf(header_file, content_file, merged_file): +# with open(merged_file, 'wb') as outfile: +# for filename in [header_file, content_file]: +# with open(filename, 'rb') as infile: +# shutil.copyfileobj(infile, outfile) \ No newline at end of file -- GitLab From 6591076646e26e52861f497937bf41031cb45c74 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 20 Nov 2023 11:05:24 +0100 Subject: [PATCH 12/25] removed R scripts --- VISOR_random_bed/random_bed.sh | 8 -- VISOR_random_bed/randomregion.r | 222 -------------------------------- 2 files changed, 230 deletions(-) delete mode 100644 VISOR_random_bed/random_bed.sh delete mode 100644 VISOR_random_bed/randomregion.r diff --git a/VISOR_random_bed/random_bed.sh b/VISOR_random_bed/random_bed.sh deleted file mode 100644 index 67a2a5c..0000000 --- a/VISOR_random_bed/random_bed.sh +++ /dev/null @@ -1,8 +0,0 @@ -### create random BED -refFAI="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta.fai" -GENOME="/home/sukanya/tests/02_data/hackathon_Ztritici/CHR8/ztIPO323.chr8.fasta" -VISOR_script="randomregion.r" - -# samtools faidx $GENOME -cut -f1,2 $refFAI > chrom.dim.tsv -Rscript $VISOR_script -d chrom.dim.tsv -n 10 -l 2 -v 'SNP,insertion,deletion,inversion' -r '70:10:10:10' -g $GENOME | sortBed > HACk.random.bed \ No newline at end of file diff --git a/VISOR_random_bed/randomregion.r b/VISOR_random_bed/randomregion.r deleted file mode 100644 index a6ca763..0000000 --- a/VISOR_random_bed/randomregion.r +++ /dev/null @@ -1,222 +0,0 @@ -# if (!requireNamespace("BiocManager", quietly = TRUE)) -# install.packages("BiocManager",repos='http://cran.us.r-project.org') -# if (!requireNamespace("optparse", quietly = TRUE)) -# install.packages("optparse",repos='http://cran.us.r-project.org') -# if (!requireNamespace("data.table", quietly = TRUE)) -# install.packages("data.table",repos='http://cran.us.r-project.org') -# if (!requireNamespace("regioneR", quietly = TRUE)) -# BiocManager::install("regioneR") -# if (!requireNamespace("Rsamtools", quietly = TRUE)) -# BiocManager::install("Rsamtools") - - -suppressPackageStartupMessages(library(optparse)) -suppressPackageStartupMessages(library(data.table)) -suppressPackageStartupMessages(library(regioneR)) -suppressPackageStartupMessages(library(Rsamtools)) - -option_list = list( - make_option(c("-d", "--dimensions"), action="store", type='character', help="TSV file with chromosome dimensions (as result from 'cut -f1,2 genome.fa.fai') [required]"), - make_option(c("-n", "--number"), action="store", default=100, type='numeric', help="number of intervals [100]"), - make_option(c("-l", "--length"), action="store", default=50000, type='numeric', help="mean intervals length [50000]"), - make_option(c("-s", "--standarddev"), action="store", default=0, type='numeric', help="standard deviation for intervals length [0]"), - make_option(c("-x", "--exclude"), action="store", default=NULL, type='character', help="exclude regions in BED [optional]"), - make_option(c("-v", "--variants"), action="store", type='character', help="variants types (-v 'deletion,tandem duplication,inversion') [required]"), - make_option(c("-r", "--ratio"), action="store", type='character', help="variants proportions (-r '30:30:40')) [required]"), - make_option(c("-i", "--idhaplo"), action="store", type='numeric', default=1, help="haplotype number [1]"), - make_option(c("-m", "--microsatellites"), action="store", type='character', default=NULL, help="BED file with microsatellites (ucsc format) [optional]"), - make_option(c("-g", "--genome"), action="store", type="character", default=NULL, help="FASTA genome [optional]") -) - -opt = parse_args(OptionParser(option_list=option_list)) - -possiblevariants<-c('deletion', 'insertion', 'inversion', 'tandem duplication', 'inverted tandem duplication', 'translocation copy-paste', 'translocation cut-paste' , 'reciprocal translocation', 'SNP', 'MNP', 'tandem repeat contraction', 'tandem repeat expansion', 'perfect tandem repetition', 'approximate tandem repetition') -possiblemicro<-c('AAC','AAG','AAT','AC', 'ACA','ACC','ACT','AG','AGA','AGC','AGG','AT','ATA','ATC','ATG','ATT','CA','CAA','CAC','CAG','CAT','CCA','CCG','CCT','CT','CTA','CTC','CTG','CTT','GA','GAA','GAT','GCA','GCC','GGA','GGC','GT','GTG','GTT','TA','TAA','TAC','TAG','TAT','TC','TCA','TCC','TCT','TG','TGA','TGC','TGG','TGT','TTA','TTC','TTG') - -if (is.null(opt$dimensions)) { - stop('-d/--dimensions TSV file is required') -} else { - genome<-fread(file.path(opt$dimensions), sep='\t', header = F) -} - -if (is.null(opt$exclude)) { - exclude<-NULL -} else { - exclude<-fread(file.path(opt$exclude), sep='\t', header=F) -} - -if (is.null(opt$variants)) { - stop('variants in -v/--variants are required') -} - -if (is.null(opt$ratio)) { - stop('variants ratios in -r/--ratio are required') -} - -variants<-unlist(strsplit(opt$variants, ',')) - -if (!all(variants %in% possiblevariants)) { - stop('accepted variants are: ', paste(possiblevariants, collapse=', ')) -} - -testt<-c("tandem repeat contraction", "tandem repeat expansion") - -if (any(testt %in% variants)) { - if (is.null(opt$microsatellites)) { - stop('when adding contractions/expansions of known microsatellites, a BED file specifying known microsatellites must be given') - } else { - microbed<-fread(file.path(opt$microsatellites), sep='\t', header=FALSE) - if (!is.null(exclude)) { - exclude<-data.frame(rbind(exclude[,1:3],data.frame(V1=microbed$V1, V2=microbed$V2-1, V3=microbed$V3+1))) - } else { - exclude<-data.frame(V1=microbed$V1, V2=microbed$V2-2, V3=microbed$V3+2) - } - } -} - -testv<-c("SNP", "MNP") - -if (any(testv %in% variants)) { - if (is.null(opt$genome)) { - stop('when adding SNPs and MNPs, a genome FASTA must be given') - } else { - if (!file.exists(file.path(paste0(opt$genome, '.fai')))) { - indexFa(file.path(opt$genome)) - } - } -} - -number<-as.numeric(unlist(strsplit(opt$ratio, ':'))) - -if (cumsum(number)[length(cumsum(number))] != 100) { - stop('sum of variant ratios must be 100') -} - -if (length(variants) != length(number)) { - stop('for each variant a ratio must be specified') -} - -varwithproportions<-rep(variants, (opt$number/100)*number) - -#the number of regions here is all minus the number of repeat contractions/expansions for which we sample from known regions -keepdist<-varwithproportions[which(varwithproportions %in% testt)] -dft<-data.frame(matrix(NA, nrow = length(keepdist), ncol = 6), stringsAsFactors=FALSE) -colnames(dft)<-c("chromosome","start","end","type","info","breakseqlen") -if (length(keepdist) != 0) { - varwithproportions<-varwithproportions[-which(varwithproportions %in% testt)] #remove - randomicro<-microbed[sample(nrow(microbed), length(keepdist)),] - for (i in 1:nrow(randomicro)) { - dft$chromosome[i]<-randomicro$V1[i] - dft$start[i]<-as.numeric(randomicro$V2[i]) - dft$end[i]<-as.numeric(randomicro$V3[i]) - dft$type[i]<-keepdist[i] - motif<-unlist(strsplit(randomicro$V4[i],'x'))[2] - number_<-as.numeric(unlist(strsplit(randomicro$V4[i],'x'))[1]) - minimum<-1 - if (keepdist[i] == 'tandem repeat contraction') { - maximum<-number_-1 - } else {#is expansion - maximum<-500 - } - dft$info[i]<-paste(motif, sample(minimum:maximum,1),sep=':') - dft$breakseqlen[i]<-0 - } -} - -regions<-createRandomRegions(length(varwithproportions), length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=exclude, non.overlapping=TRUE) - -# just in case we need another list of regions to exclude for translocation -newexclude<-data.frame(rbind(exclude[,1:3], cbind(as.character(seqnames(regions)), as.numeric(start(regions)-2), as.numeric(end(regions)+2)))) - -df <- data.frame(chromosome=seqnames(regions),start=start(regions),end=end(regions), type=varwithproportions, stringsAsFactors = FALSE) - -info<-rep('INFO',nrow(df)) -breakseqlen<-rep(0, nrow(df)) - -for(i in (1:nrow(df))) { - if (df$type[i] == 'deletion') { - info[i]<-'None' - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'tandem duplication') { - info[i] <- '2' - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'inversion'){ - info[i] <- 'None' - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'inverted tandem duplication') { - info[i] <- '2' - } else if (df$type[i] == 'insertion') { - df$start[i]<-df$end[i]-1 - num<-round(rnorm(1, mean=opt$length, sd=opt$standarddev)) #adapt mean and stdev for insertions to those specified by the user - alphabet<-c('A', 'T', 'C', 'G') - motif<-paste(sample(alphabet, num, replace = T), collapse='') - info[i]<-motif - breakseqlen[i]<-sample(0:10,1) - } else if (df$type[i] == 'perfect tandem repetition') { - df$start[i]<-df$end[i]-1 - motif<-sample(possiblemicro,1) - num<-sample(6:500,1) - info[i]<-paste(motif,num,sep=':') - breakseqlen[i]<-0 - } else if (df$type[i] == 'approximate tandem repetition') { - df$start[i]<-df$end[i]-1 - motif<-sample(possiblemicro,1) - num<-sample(6:500,1) - err<-sample(1:round(num/3),1) - info[i]<-paste(motif,num,err,sep=':') - breakseqlen[i]<-0 - } else if (df$type[i] == 'SNP') { - df$start[i]<-df$end[i]-1 - tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$end[i], end=df$end[i])) - nuc<-as.character(scanFa(opt$genome,tog))[[1]] - if (nuc == 'N') { - alphabet<-c('A', 'T', 'C', 'G', 'N') - } else { - alphabet<-c('A', 'T', 'C', 'G') - } - alphabet<-alphabet[-which(alphabet == nuc)] - info[i]<-sample(alphabet,1) - breakseqlen[i]<-0 - } else if(df$type[i] == 'MNP') { - smallseq<-sample(2:10,1) - df$start[i]<-df$end[i]-smallseq - tog<-makeGRangesFromDataFrame(data.frame(chromosome=df$chromosome[i], start=df$start[i], end=df$end[i])) - nucseq<-as.character(scanFa(opt$genome,tog))[[1]] - for (l in 1:nchar(nucseq)) { - nuc<-substr(nucseq, l, l) - if (nuc == 'N') { - alphabet<-c('A', 'T', 'C', 'G', 'N') - } else { - alphabet<-c('A', 'T', 'C', 'G') - } - alphabet<-alphabet[-which(alphabet == nuc)] - substr(nucseq, l, l)<-sample(alphabet,1) - } - info[i]<-nucseq - breakseqlen[i]<-0 - } else if (df$type[i] == 'translocation cut-paste' | df$type[i] == 'translocation copy-paste') { - newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE) - chromosome<-as.character(seqnames(newregion)) - start<-as.numeric(start(newregion)) - end<-as.numeric(end(newregion)) - orientation<-sample(c('forward', 'reverse'),1) - newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2))) - info[i]<-paste(paste0('h',opt$idhaplo),chromosome,start,orientation,sep=':') - breakseqlen[i]<-sample(0:10,1) - } else { - newregion<-createRandomRegions(1, length.mean=opt$length, length.sd=opt$standarddev, genome=genome, mask=newexclude, non.overlapping=TRUE) - chromosome<-as.character(seqnames(newregion)) - start<-as.numeric(start(newregion)) - end<-as.numeric(end(newregion)) - orientation1<-sample(c('forward', 'reverse'),1) - orientation2<-sample(c('forward', 'reverse'),1) - newexclude<-data.frame(rbind(newexclude,c(chromosome, start-2, end+2))) - info[i]<-paste(paste0('h',opt$idhaplo+1),chromosome,start,orientation1, orientation2, sep=':') - breakseqlen[i]<-sample(0:10,1) - } -} - -final<-data.frame(df,info,breakseqlen, stringsAsFactors = FALSE) - -fwrite(rbind(final,dft),file = '',quote = FALSE, col.names = FALSE, row.names = FALSE, sep='\t') - -- GitLab From d6991c8817655d4f0c426522684edfd7500c2e48 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 20 Nov 2023 11:05:45 +0100 Subject: [PATCH 13/25] cleaned up --- VISOR_random_bed/randombed.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/VISOR_random_bed/randombed.py b/VISOR_random_bed/randombed.py index 788577a..47bcd96 100644 --- a/VISOR_random_bed/randombed.py +++ b/VISOR_random_bed/randombed.py @@ -63,18 +63,4 @@ def generate_type(nvar, yml, fai): df = pd.DataFrame(data = for_df) # shuffle values in dataframe df = df.sample(frac = 1) - # print(df) - return(df) - - -# tandem duplication: 0 -# inverted tandem duplication: 0 -# translocation copy-paste: 0 -# translocation cut-paste : 0 -# reciprocal translocation: 0 - - - -# fai = "/home/sukanya/tests/02_data/Sibirica_v1.0.fa.fai" -# # select_chr(fai) -# generate_type(12, "visor_sv_type.yaml", fai) + return(df) \ No newline at end of file -- GitLab From 284fe58b6a7edb7be13dcc405560367a0ed531b7 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 20 Nov 2023 11:05:58 +0100 Subject: [PATCH 14/25] modified to fit new scripts --- variants_generation.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/variants_generation.py b/variants_generation.py index 27af37e..8012b47 100644 --- a/variants_generation.py +++ b/variants_generation.py @@ -1,31 +1,39 @@ from bed2vcf import read_fa, get_seq +from randombed import generate_type from fusion import * import argparse from multiprocessing import Process ## n : length of variant -def sv_vcf(bed, vcf, fasta, outName, header): - bed_df = replace_bed_col(bed, vcf) - if header != None: - vcf_df=read_vcf(vcf, header) - else: - vcf_df=read_vcf(vcf) +def sv_vcf(vcf, fasta, fai, yml, outName): + + # bed_df = replace_bed_col(bed, vcf) + # if header != None: + # vcf_df=read_vcf(vcf, header) + # else: + # vcf_df=read_vcf(vcf) + + vcf_df=get_vcf(vcf) + nvar = len(vcf_df) + bed_df = generate_type(nvar, yml, fai) fa = read_fa(fasta) get_seq(vcf_df, bed_df, fa, outName) - header_file="results/vcf_header.txt" - content_file = "results/" + outName + ".vcf" - merged_file="results/" + outName + "_final.vcf" - merge_final_vcf(header_file, content_file, merged_file) + # header_file="results/vcf_header.txt" + # content_file = "results/" + outName + ".vcf" + # merged_file="results/" + outName + "_final.vcf" + # merge_final_vcf(header_file, content_file, merged_file) # parse arguments parser = argparse.ArgumentParser(description='create final VCF') -parser.add_argument('-b','--bed', type=str, required = True, help='BED file of structural variants to include') +# parser.add_argument('-b','--bed', type=str, required = True, help='BED file of structural variants to include') parser.add_argument('-v', '--vcf', type=str, required = True, help='VCF generated with msprime --> tree generation script') parser.add_argument('-fa', '--fasta', type=str, required = True, help='Reference FASTA for the VCF and the variants') +parser.add_argument('-fai', '--fai', type=str, required = True, help='Samtools index of reference FASTA') +parser.add_argument('-y', '--yaml', type=str, required = True, help='Reference FASTA for the VCF and the variants') parser.add_argument('-o', '--outName', type=str, required = True, help='Output name') -parser.add_argument('--header', type=int, help='vcf header size') +# parser.add_argument('--header', type=int, help='vcf header size') if __name__ == '__main__': args = parser.parse_args() -- GitLab From e7f9197b18f113ce94e93ed02a9836b015c3a2a1 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 20 Nov 2023 11:27:08 +0100 Subject: [PATCH 15/25] moved variable --- bed2vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bed2vcf.py b/bed2vcf.py index ee40bf2..313941f 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -111,11 +111,11 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): trans_chr = trans_info[0] trans_start = trans_info[1]-1 trans_end = trans_start + l + fasta_seq_trans = fa_dict[trans_chr].upper() # translocation réciproque if sv_type == "reciprocal translocation": ref = str(fasta_seq.seq[start:end]) - fasta_seq_trans = fa_dict[trans_chr].upper() ref2 = str(fasta_seq_trans.seq[trans_start:trans_end]) if trans_info[2] == "reverse": -- GitLab From 241c3f3f50af0eefe903d11e4003c7158a2cd127 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 14:57:37 +0100 Subject: [PATCH 16/25] snakemake first draft --- .gitignore | 1 + Snakefile | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 Snakefile diff --git a/.gitignore b/.gitignore index f7f2ef4..103f54c 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ #!*.txt !*.yml !*.yaml +!Snakefile # 3) add a pattern to track the file patterns of section2 even if they are in # subdirectories diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..7f04726 --- /dev/null +++ b/Snakefile @@ -0,0 +1,105 @@ +configfile: 'config.yaml' + +rule all: + input: + config["outdir"] + "/msprime.vcf", + "results/" + config["name"] + ".vcf", + "results/graph.gfa" + +rule fai: + input: + fa = config["fa"] + output: + fai = config["fa"] + ".fai" + container: + "docker://staphb/samtools:1.18" + shell: + "samtools faidx {input.fa} --fai-idx {output.fai}" + +### msprime coalescent generation +rule msprime: + input: + fai = rules.fai.output + output: + msvcf = temp(config["outdir"] + "/msprime_simulation.vcf"), + header = temp(config["outdir"] + "/header.vcf"), + params: + population = config["population_size"], + mut = config["mutation_rate"], + size = config["sample_size"], + outdir = config["outdir"] + container: + "../mspv.sif" + shell: + "python3 /mspv/tree_generation.py -fai {input.fai} -p {params.population} -r {params.mut} -n {params.size} -o {params.outdir}" + +rule msprime_vcf: + input: + vcf = rules.msprime.output.msvcf, + header = rules.msprime.output.header + output: + config["outdir"] + "/msprime.vcf" + container: + "../mspv.sif" + shell: + "cat {input.header} {input.vcf} > {output}" + +rule bed_vcf: + input: + msvcf = rules.msprime.output.msvcf, + fa = config["fa"], + fai = rules.fai.output, + yaml = config["yaml"] + output: + vcf = "results/" + config["name"] + ".vcf", + params: + out = config["name"] + container: + "../mspv_jammy.sif" + shell: + "python3 /mspv/variants_generation.py --vcf {input.msvcf} --fasta {input.fa} --fai {input.fai} -y {input.yaml} -o {params.out}" + +use rule msprime_vcf as final_vcf with: + input: + vcf = rules.bed_vcf.output, + header = rules.msprime.output.header + output: + "results/test_final.vcf" + + +rule graph: + input: + vcf = rules.final_vcf.output, + fa = config["fa"] + output: + graph = "results/index.giraffe.gbz" + container: + "docker://quay.io/vgteam/vg:v1.52.0" + shell: + "vg autoindex -r {input.fa} -v {input.vcf} -w giraffe" + +rule graph_gfa: + input: + rules.graph.output + output: + graph = "results/graph.gfa" + container: + "docker://quay.io/vgteam/vg:v1.52.0" + shell: + "vg convert -f {input} > {output.graph}" + +# rule genomes: +# input: +# graph = rules.output.graph +# output: +# dir("genomes") +# container: +# "vg" +# shell: +# "vg" + +# $IMG vg paths -M -x index.giraffe.gbz +# ## convertir les haplotypes en FASTA (donne un multifasta) +# $IMG vg paths -F -x index.giraffe.gbz > paths_index.fa +# ## convertir le graphe au format GFA +# $IMG vg convert -f index.giraffe.gbz > index.giraffe.gfa \ No newline at end of file -- GitLab From 94af9ad98d1e5397aa748d93c5b2f0f402b861d5 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 16:33:42 +0100 Subject: [PATCH 17/25] changed df use --- bed2vcf.py | 56 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/bed2vcf.py b/bed2vcf.py index 313941f..12eac1c 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -34,9 +34,9 @@ def deletion(ref_seq, alt_seq): alt = str(ref_seq) return(ref,alt) -def set_ref_alt(ref, alt, row, vcf_df): - vcf_df.at[row, "REF"] = ref - vcf_df.at[row, "ALT"] = alt +def set_ref_alt(ref, alt, row, df): + df.at[row, "REF"] = ref + df.at[row, "ALT"] = alt def get_random_len(svtype): # INS, DEL, DUP, CNV, INV @@ -54,18 +54,27 @@ def get_random_len(svtype): # bed_def : VISOR BED # fa_dict : FASTA where variants will be generated def get_seq(vcf_df, bed_df, fa_dict, output_file): + cols = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] + ncol = len(vcf_df.columns) - len(cols) + li = [i for i in range(ncol)] + cols.extend(li) + vcf_df.columns = cols + + df = vcf_df.join(bed_df) # number of variants - n = len(vcf_df) + n = len(df) + # add variants + series = [] for i in range(n): - sv_info = bed_df.iloc[i] ## nom du chr pris dans le VCF - chr_name = vcf_df.iloc[i][0] + chr_name = df["CHROM"].iloc[i] ## position du variant pris dans le VCF - start = vcf_df.iloc[i][1] - start = start-1 # pour ajuster à l'index python + start = df["POS"].iloc[i] + start = int(start)-1 # pour ajuster à l'index python ## type de SV à générer - sv_type = sv_info[0] + sv_info = df['SVINFO'].iloc[i] + sv_type = df['SVTYPE'].iloc[i] fasta_seq = fa_dict[chr_name].upper() @@ -93,7 +102,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): end = start + get_random_len("DUP") ref = str(fasta_seq.seq[start]) alt_seq1 = str(fasta_seq.seq[start+1:end]) - cp = int(sv_info[1])-1 + cp = sv_info-1 # multiplication pour obtenir les copies alt_seq = alt_seq1*cp if sv_type == "inverted tandem duplication": @@ -107,9 +116,8 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): end = start + l # informations sur la translocation - trans_info = sv_info[1] - trans_chr = trans_info[0] - trans_start = trans_info[1]-1 + trans_chr = sv_info[0] + trans_start = sv_info[1]-1 trans_end = trans_start + l fasta_seq_trans = fa_dict[trans_chr].upper() @@ -118,12 +126,12 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): ref = str(fasta_seq.seq[start:end]) ref2 = str(fasta_seq_trans.seq[trans_start:trans_end]) - if trans_info[2] == "reverse": + if sv_info[2] == "reverse": alt = str(fasta_seq.seq[start]) + reverse(str(fasta_seq_trans.seq[trans_start+1:trans_end])) else : alt = str(fasta_seq.seq[start]) + str(fasta_seq_trans.seq[trans_start+1:trans_end]) - if trans_info[3] == "reverse": + if sv_info[3] == "reverse": alt2 = str(fasta_seq_trans.seq[trans_start]) + reverse(str(fasta_seq.seq[start+1:end])) else: alt2 = str(fasta_seq_trans.seq[trans_start]) + str(fasta_seq.seq[start+1:end]) @@ -136,7 +144,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): # insertion ref2 = str(fasta_seq_trans.seq[trans_start]) - if trans_info[2] == "reverse": + if sv_info[2] == "reverse": alt2 = ref2 + reverse(str(fasta_seq.seq[start+1:end])) else : alt2 = ref2 + str(fasta_seq.seq[start+1:end]) @@ -144,7 +152,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): # copier-coller else: ref = str(fasta_seq.seq[start]) - alt = vcf_df["ALT"].iloc[i] + alt = SNP(ref) ref2 = str(fasta_seq_trans.seq[trans_start]) alt2 = ref2 + str(fasta_seq.seq[start+1:end]) @@ -154,14 +162,18 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): new_vcf_var[1] = trans_start+1 new_vcf_var[3] = ref2 new_vcf_var[4] = alt2 - vcf_df.loc[len(vcf_df)] = new_vcf_var + save_series = pd.Series(new_vcf_var, index=cols) + series.append(save_series) + + set_ref_alt(ref, alt, i, df) - set_ref_alt(ref, alt, i, vcf_df) + df_add = pd.DataFrame(series) + df = pd.concat([df, df_add], ignore_index=True) # remove unnecessary columns for VCF - vcf_df["ID"] = "." - vcf_df["QUAL"] = 99 + df["ID"] = "." + df["QUAL"] = 99 # output VCF if not os.path.exists("results"): os.mkdir("results") - vcf_df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False) \ No newline at end of file + df.to_csv("results/" + output_file + ".vcf", sep="\t", header=False, index=False) \ No newline at end of file -- GitLab From fc69707032eb34358380112b999ac7335f50fc64 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 16:34:31 +0100 Subject: [PATCH 18/25] fixed df too short --- randombed.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/randombed.py b/randombed.py index ee5fb1b..f92f2bb 100644 --- a/randombed.py +++ b/randombed.py @@ -34,7 +34,6 @@ def select_orientation(li): ## create dataframe with all the variants to simulate def generate_type(nvar, yml, fai): d = read_yaml(yml) - print(d) ## list for variant type and info vartype = [] infos = [] @@ -57,24 +56,16 @@ def generate_type(nvar, yml, fai): info = infos_dict[k] infos.extend([info]*n) vartype.extend([k]*n) - - for_df = {'variant': vartype, 'info': infos} + + if len(vartype) != nvar: + n = nvar - len(vartype) + vartype.extend(["SNP"]*n) + infos.extend([None]*n) + + for_df = {'SVTYPE': vartype, 'SVINFO': infos} df = pd.DataFrame(data = for_df) # shuffle values in dataframe - df = df.sample(frac = 1) + df = df.sample(frac = 1, ignore_index=True) # print(df) return(df) - - -# tandem duplication: 0 -# inverted tandem duplication: 0 -# translocation copy-paste: 0 -# translocation cut-paste : 0 -# reciprocal translocation: 0 - - - -fai = "/home/sukanya/tests/02_data/Sibirica_v1.0.fa.fai" -# select_chr(fai) -generate_type(12, "visor_sv_type.yaml", fai) -- GitLab From 7d1a4b324cf388aaa755964a174a4cee39dea775 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 16:34:52 +0100 Subject: [PATCH 19/25] added vcf sorting --- Snakefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 7f04726..0581683 100644 --- a/Snakefile +++ b/Snakefile @@ -66,13 +66,20 @@ use rule msprime_vcf as final_vcf with: output: "results/test_final.vcf" +rule sort_vcf: + input: + rules.final_vcf.output + output: + "results/test_final_sorted.vcf" + shell: + "bcftools sort {input} -Oz -o {output}" rule graph: input: - vcf = rules.final_vcf.output, + vcf = rules.sort_vcf.output, fa = config["fa"] output: - graph = "results/index.giraffe.gbz" + graph = "index.giraffe.gbz" container: "docker://quay.io/vgteam/vg:v1.52.0" shell: -- GitLab From 85fab3f030f3cfcb883beb89bbc1fa12858a291e Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 16:35:07 +0100 Subject: [PATCH 20/25] removed duplicated file --- VISOR_random_bed/randombed.py | 66 ----------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 VISOR_random_bed/randombed.py diff --git a/VISOR_random_bed/randombed.py b/VISOR_random_bed/randombed.py deleted file mode 100644 index 47bcd96..0000000 --- a/VISOR_random_bed/randombed.py +++ /dev/null @@ -1,66 +0,0 @@ -import pandas as pd -import yaml, random, re - -## read yaml file containing variants informations -def read_yaml(f): - stream = open(f, 'r') - d = yaml.safe_load(stream) - stream.close() - if sum(d.values()) != 100: - raise ValueError("Sum of values must be 100") - return(d) - -infos_dict = {'deletion': None, 'insertion': None, 'inversion': None, 'SNP': None, - 'tandem duplication': 2, 'inverted tandem duplication': 2, - 'translocation copy-paste': 0, 'translocation cut-paste': 0, 'reciprocal translocation': 0} - -## generate info field for translocations -# randomly select chromosome to add sequence to -def select_chr(fai): - df = pd.read_table(fai, header=None, usecols=[0,1], names =["name", "length"]) - names=df["name"].to_list() - chr = random.choice(names) - df = pd.pivot_table(df, columns='name') - len = df.iloc[0][chr] - pos = random.randrange(len) - li = [chr, pos] - return(li) - -# randomly select sequence orientation -def select_orientation(li): - li.append(random.choice(["forward", "reverse"])) - return(li) - -## create dataframe with all the variants to simulate -def generate_type(nvar, yml, fai): - d = read_yaml(yml) - print(d) - ## list for variant type and info - vartype = [] - infos = [] - ## filter dict to keep value != 0 - dict = {k: v for k, v in d.items() if v != 0} - for k in dict: - # generate n variants - perc = dict[k] - n = round(perc*nvar/100) - # add info field - if re.search("translocation", k): - for i in range(n): - info = select_chr(fai) - info = select_orientation(info) - if k == "reciprocal translocation": - info = select_orientation(info) - infos.extend([info]) - i += 1 - else: - info = infos_dict[k] - infos.extend([info]*n) - vartype.extend([k]*n) - - for_df = {'variant': vartype, 'info': infos} - - df = pd.DataFrame(data = for_df) - # shuffle values in dataframe - df = df.sample(frac = 1) - return(df) \ No newline at end of file -- GitLab From 26d7ac8af81ba2d44745faa0bd773ccdd4b142bf Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 16:35:23 +0100 Subject: [PATCH 21/25] cleaned up --- variants_generation.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/variants_generation.py b/variants_generation.py index 8012b47..5e86744 100644 --- a/variants_generation.py +++ b/variants_generation.py @@ -6,13 +6,6 @@ from multiprocessing import Process ## n : length of variant def sv_vcf(vcf, fasta, fai, yml, outName): - - # bed_df = replace_bed_col(bed, vcf) - # if header != None: - # vcf_df=read_vcf(vcf, header) - # else: - # vcf_df=read_vcf(vcf) - vcf_df=get_vcf(vcf) nvar = len(vcf_df) bed_df = generate_type(nvar, yml, fai) @@ -20,11 +13,6 @@ def sv_vcf(vcf, fasta, fai, yml, outName): get_seq(vcf_df, bed_df, fa, outName) - # header_file="results/vcf_header.txt" - # content_file = "results/" + outName + ".vcf" - # merged_file="results/" + outName + "_final.vcf" - # merge_final_vcf(header_file, content_file, merged_file) - # parse arguments parser = argparse.ArgumentParser(description='create final VCF') # parser.add_argument('-b','--bed', type=str, required = True, help='BED file of structural variants to include') -- GitLab From ec877336318a27c08d068b32b6fbe075b45ff8c7 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 16:42:29 +0100 Subject: [PATCH 22/25] merged files --- fusion.py | 46 ------------------------------------------ variants_generation.py | 4 ++-- 2 files changed, 2 insertions(+), 48 deletions(-) delete mode 100644 fusion.py diff --git a/fusion.py b/fusion.py deleted file mode 100644 index b66f956..0000000 --- a/fusion.py +++ /dev/null @@ -1,46 +0,0 @@ -# fusionner le VCF de msprime (position, génotype) et le BED de VISOR -# pour obtenir un BED avec tous les variants -# récupérer les variants et les génotypes msprime -import io, os, shutil -import pandas as pd - -# def read_vcf(input_file, header_size=6): -# with open(input_file, "r") as txt: -# t = txt.read() -# txt.close() - -# # write header for later -# if not os.path.exists("results"): -# os.mkdir("results") -# f = open("results/vcf_header.txt", "w") -# f.write(''.join(t.splitlines(keepends=True)[:header_size])) -# f.close() - -# # remove VCF header for dataframe -# t = ''.join(t.splitlines(keepends=True)[header_size-1:]) -# df = pd.read_table(io.StringIO(t)) -# df = df.rename(columns={"#CHROM" : "CHROM"}) -# return(df) - -def get_vcf(input_file): - df = pd.read_table(input_file, sep="\t") - return(df) - -# def read_BED(input_file): -# colnames=["chr", "start", "end", "type","info","breakpoint"] -# df = pd.read_table(input_file, header=None, names=colnames) -# return(df) - -# def replace_bed_col(input_BED, input_VCF): -# vcf = read_vcf(input_VCF) -# bed = read_BED(input_BED) -# if len(bed) == len(vcf): -# bed["chr"] = vcf["CHROM"] -# bed["start"] = vcf["POS"] -# return(bed) - -# def merge_final_vcf(header_file, content_file, merged_file): -# with open(merged_file, 'wb') as outfile: -# for filename in [header_file, content_file]: -# with open(filename, 'rb') as infile: -# shutil.copyfileobj(infile, outfile) \ No newline at end of file diff --git a/variants_generation.py b/variants_generation.py index 5e86744..df622cb 100644 --- a/variants_generation.py +++ b/variants_generation.py @@ -1,12 +1,12 @@ from bed2vcf import read_fa, get_seq from randombed import generate_type -from fusion import * import argparse +import pandas as pd from multiprocessing import Process ## n : length of variant def sv_vcf(vcf, fasta, fai, yml, outName): - vcf_df=get_vcf(vcf) + vcf_df = pd.read_table(vcf, sep="\t") nvar = len(vcf_df) bed_df = generate_type(nvar, yml, fai) fa = read_fa(fasta) -- GitLab From 302c2622527570fc7fa9b9ffbd9da7fae4705660 Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Tue, 21 Nov 2023 16:53:41 +0100 Subject: [PATCH 23/25] drop cols for vcf --- bed2vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bed2vcf.py b/bed2vcf.py index 12eac1c..3d3172c 100644 --- a/bed2vcf.py +++ b/bed2vcf.py @@ -169,6 +169,7 @@ def get_seq(vcf_df, bed_df, fa_dict, output_file): df_add = pd.DataFrame(series) df = pd.concat([df, df_add], ignore_index=True) + df = df.drop(['SVTYPE', 'SVINFO'], axis=1) # remove unnecessary columns for VCF df["ID"] = "." -- GitLab From 0083b8a34d99c9a42b95faa71225befbcf99f9ab Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 18 Dec 2023 09:49:04 +0100 Subject: [PATCH 24/25] update + python for bed generation --- README.md | 20 ++--- Snakefile | 112 ---------------------------- VISOR_random_bed/visor_sv_type.yaml | 13 +--- randombed.py | 1 + 4 files changed, 16 insertions(+), 130 deletions(-) delete mode 100644 Snakefile diff --git a/README.md b/README.md index 2c1b4be..f6f03d2 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,25 @@ With a population generated by msprime (or any VCF), generate structural variant Help for commands is available: ```bash -python main.py -h -python main.py msprime-vcf -h -python main.py sv-vcf -h +python tree_generation.py -h +python variants_generation.py -h ``` +# ! DOC IS NOT UPDATED + ## 1. Use a VCF as a base A VCF is used as a base for genotypes and the position and number of variants. This VCF can be create with `msprime` to generate a population with: ```bash -python main.py msprime-vcf <FAI> <POPULATION SIZE> <MUTATION RATE> <SAMPLE SIZE> +python tree_generation.py -fai <FAI> -p <POPULATION SIZE> -r <MUTATION RATE> -n <SAMPLE SIZE> ``` This command output as many VCF as there are chromosomes in the reference FAI. ## 2. Generate structural variants -Using the script `VISOR_random_bed/random_bed.sh`, generate a BED with structural variant informations. All structural variant types handled by VISOR are in the `VISOR_random_bed/visor_sv_type.yaml` file. Modify input in the script: -- generate as many variants as there are in the VCF -- choose structural variant types -- choose structural variant proportions (must equal 100) +```bash +python variants_generation.py -fai <FAI> -p <POPULATION SIZE> -r <MUTATION RATE> -n <SAMPLE SIZE> +``` +Modify the YAML input to set the percentage of each variant you want to simulate (must equal 100). The number of variant is the same as in the VCF created with `msprime`. + [How to use the randomregion.r script](https://davidebolo1993.github.io/visordoc/usecases/usecases.html#visor-hack) @@ -39,4 +41,4 @@ The distributions are used to randomly sample each structural variant size. - [ ] tandem repeat contraction - [ ] tandem repeat expansion - [ ] approximate tandem repetition -- [ ] Add VCF merging when there are multiple chromosomes \ No newline at end of file +- [x] Add VCF merging when there are multiple chromosomes \ No newline at end of file diff --git a/Snakefile b/Snakefile deleted file mode 100644 index 0581683..0000000 --- a/Snakefile +++ /dev/null @@ -1,112 +0,0 @@ -configfile: 'config.yaml' - -rule all: - input: - config["outdir"] + "/msprime.vcf", - "results/" + config["name"] + ".vcf", - "results/graph.gfa" - -rule fai: - input: - fa = config["fa"] - output: - fai = config["fa"] + ".fai" - container: - "docker://staphb/samtools:1.18" - shell: - "samtools faidx {input.fa} --fai-idx {output.fai}" - -### msprime coalescent generation -rule msprime: - input: - fai = rules.fai.output - output: - msvcf = temp(config["outdir"] + "/msprime_simulation.vcf"), - header = temp(config["outdir"] + "/header.vcf"), - params: - population = config["population_size"], - mut = config["mutation_rate"], - size = config["sample_size"], - outdir = config["outdir"] - container: - "../mspv.sif" - shell: - "python3 /mspv/tree_generation.py -fai {input.fai} -p {params.population} -r {params.mut} -n {params.size} -o {params.outdir}" - -rule msprime_vcf: - input: - vcf = rules.msprime.output.msvcf, - header = rules.msprime.output.header - output: - config["outdir"] + "/msprime.vcf" - container: - "../mspv.sif" - shell: - "cat {input.header} {input.vcf} > {output}" - -rule bed_vcf: - input: - msvcf = rules.msprime.output.msvcf, - fa = config["fa"], - fai = rules.fai.output, - yaml = config["yaml"] - output: - vcf = "results/" + config["name"] + ".vcf", - params: - out = config["name"] - container: - "../mspv_jammy.sif" - shell: - "python3 /mspv/variants_generation.py --vcf {input.msvcf} --fasta {input.fa} --fai {input.fai} -y {input.yaml} -o {params.out}" - -use rule msprime_vcf as final_vcf with: - input: - vcf = rules.bed_vcf.output, - header = rules.msprime.output.header - output: - "results/test_final.vcf" - -rule sort_vcf: - input: - rules.final_vcf.output - output: - "results/test_final_sorted.vcf" - shell: - "bcftools sort {input} -Oz -o {output}" - -rule graph: - input: - vcf = rules.sort_vcf.output, - fa = config["fa"] - output: - graph = "index.giraffe.gbz" - container: - "docker://quay.io/vgteam/vg:v1.52.0" - shell: - "vg autoindex -r {input.fa} -v {input.vcf} -w giraffe" - -rule graph_gfa: - input: - rules.graph.output - output: - graph = "results/graph.gfa" - container: - "docker://quay.io/vgteam/vg:v1.52.0" - shell: - "vg convert -f {input} > {output.graph}" - -# rule genomes: -# input: -# graph = rules.output.graph -# output: -# dir("genomes") -# container: -# "vg" -# shell: -# "vg" - -# $IMG vg paths -M -x index.giraffe.gbz -# ## convertir les haplotypes en FASTA (donne un multifasta) -# $IMG vg paths -F -x index.giraffe.gbz > paths_index.fa -# ## convertir le graphe au format GFA -# $IMG vg convert -f index.giraffe.gbz > index.giraffe.gfa \ No newline at end of file diff --git a/VISOR_random_bed/visor_sv_type.yaml b/VISOR_random_bed/visor_sv_type.yaml index 557ca6f..518bcaf 100644 --- a/VISOR_random_bed/visor_sv_type.yaml +++ b/VISOR_random_bed/visor_sv_type.yaml @@ -1,15 +1,10 @@ -# type de SV possible dans le script randomregion.r de VISOR +### percentage of each variant type in the simulation, sum of values should be 100 +SNP: 0 deletion: 0 insertion: 0 -inversion: 0 +inversion: 100 tandem duplication: 0 inverted tandem duplication: 0 translocation copy-paste: 0 translocation cut-paste : 0 -reciprocal translocation: 0 -SNP: 0 -# MNP: 0 -# tandem repeat contraction: 0 -# tandem repeat expansion: 0 -# perfect tandem repetition: 0 -# approximate tandem repetition: 0 \ No newline at end of file +reciprocal translocation: 0 \ No newline at end of file diff --git a/randombed.py b/randombed.py index f92f2bb..3f15a71 100644 --- a/randombed.py +++ b/randombed.py @@ -68,4 +68,5 @@ def generate_type(nvar, yml, fai): # shuffle values in dataframe df = df.sample(frac = 1, ignore_index=True) # print(df) + df.to_csv("random_var.tsv", sep="\t", index=False) return(df) -- GitLab From 81428ce618acebbb5a97441b325087b40aa4353c Mon Sep 17 00:00:00 2001 From: "sukanya.denni" <sukanya.denni@inrae.fr> Date: Mon, 18 Dec 2023 09:52:06 +0100 Subject: [PATCH 25/25] update readme --- README.md | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index f6f03d2..67ded0c 100644 --- a/README.md +++ b/README.md @@ -18,18 +18,10 @@ This command output as many VCF as there are chromosomes in the reference FAI. ## 2. Generate structural variants ```bash -python variants_generation.py -fai <FAI> -p <POPULATION SIZE> -r <MUTATION RATE> -n <SAMPLE SIZE> -``` -Modify the YAML input to set the percentage of each variant you want to simulate (must equal 100). The number of variant is the same as in the VCF created with `msprime`. - - -[How to use the randomregion.r script](https://davidebolo1993.github.io/visordoc/usecases/usecases.html#visor-hack) - -## 3. Get the final VCF -The final VCF is obtained by merging the msprime VCF, the VISOR BED and the structural variant sequences. -```bash -python main.py sv-vcf <BED> <VCF> <FASTA> <OUTNAME> +python variants_generation.py -fai <FAI> -fa <FASTA> -v <VCF> -y <YAML> ``` +The VCF is the one created with the previous command using `msprime`. +YAML template is available in `VISOR_random_bed` folder. Modify the YAML input to set the percentage of each variant you want to simulate (must equal 100). Each variant type has a size distribution file (bins = 100 bp) in folder `sv_distributions`. The data was extracted from [An integrated map of structural variation in 2,504 human genomes (Sudmant, et al. 2015)](https://www.nature.com/articles/nature15394). -- GitLab